diff --git a/engineering/terraform-patterns/.claude-plugin/plugin.json b/engineering/terraform-patterns/.claude-plugin/plugin.json new file mode 100644 index 0000000..f65c6d7 --- /dev/null +++ b/engineering/terraform-patterns/.claude-plugin/plugin.json @@ -0,0 +1,13 @@ +{ + "name": "terraform-patterns", + "description": "Terraform infrastructure-as-code agent skill and plugin for module design patterns, state management strategies, provider configuration, security hardening, and CI/CD plan/apply workflows. Covers mono-repo vs multi-repo, workspaces, policy-as-code, and drift detection.", + "version": "1.0.0", + "author": { + "name": "Alireza Rezvani", + "url": "https://alirezarezvani.com" + }, + "homepage": "https://github.com/alirezarezvani/claude-skills/tree/main/engineering/terraform-patterns", + "repository": "https://github.com/alirezarezvani/claude-skills", + "license": "MIT", + "skills": "./" +} diff --git a/engineering/terraform-patterns/SKILL.md b/engineering/terraform-patterns/SKILL.md new file mode 100644 index 0000000..6c5c790 --- /dev/null +++ b/engineering/terraform-patterns/SKILL.md @@ -0,0 +1,487 @@ +--- +name: "terraform-patterns" +description: "Terraform infrastructure-as-code agent skill and plugin for Claude Code, Codex, Gemini CLI, Cursor, OpenClaw. Covers module design patterns, state management strategies, provider configuration, security hardening, policy-as-code with Sentinel/OPA, and CI/CD plan/apply workflows. Use when: user wants to design Terraform modules, manage state backends, review Terraform security, implement multi-region deployments, or follow IaC best practices." +license: MIT +metadata: + version: 1.0.0 + author: Alireza Rezvani + category: engineering + updated: 2026-03-15 +--- + +# Terraform Patterns + +> Predictable infrastructure. Secure state. Modules that compose. No drift. + +Opinionated Terraform workflow that turns sprawling HCL into well-structured, secure, production-grade infrastructure code. 
Covers module design, state management, provider patterns, security hardening, and CI/CD integration. + +Not a Terraform tutorial — a set of concrete decisions about how to write infrastructure code that doesn't break at 3 AM. + +--- + +## Slash Commands + +| Command | What it does | +|---------|-------------| +| `/terraform:review` | Analyze Terraform code for anti-patterns, security issues, and structure problems | +| `/terraform:module` | Design or refactor a Terraform module with proper inputs, outputs, and composition | +| `/terraform:security` | Audit Terraform code for security vulnerabilities, secrets exposure, and IAM misconfigurations | + +--- + +## When This Skill Activates + +Recognize these patterns from the user: + +- "Review this Terraform code" +- "Design a Terraform module for..." +- "My Terraform state is..." +- "Set up remote state backend" +- "Multi-region Terraform deployment" +- "Terraform security review" +- "Module structure best practices" +- "Terraform CI/CD pipeline" +- Any request involving: `.tf` files, HCL, Terraform modules, state management, provider configuration, infrastructure-as-code + +If the user has `.tf` files or wants to provision infrastructure with Terraform → this skill applies. + +--- + +## Workflow + +### `/terraform:review` — Terraform Code Review + +1. **Analyze current state** + - Read all `.tf` files in the target directory + - Identify module structure (flat vs nested) + - Count resources, data sources, variables, outputs + - Check naming conventions + +2. 
**Apply review checklist** + + ``` + MODULE STRUCTURE + ├── Variables have descriptions and type constraints + ├── Outputs expose only what consumers need + ├── Resources use consistent naming: {provider}_{type}_{purpose} + ├── Locals used for computed values and DRY expressions + └── No hardcoded values — everything parameterized or in locals + + STATE & BACKEND + ├── Remote backend configured (S3, GCS, Azure Blob, Terraform Cloud) + ├── State locking enabled (DynamoDB for S3, native for others) + ├── State encryption at rest enabled + ├── No secrets stored in state (or state access is restricted) + └── Workspaces or directory isolation for environments + + PROVIDERS + ├── Version constraints use pessimistic operator: ~> 5.0 + ├── Required providers block in terraform {} block + ├── Provider aliases for multi-region or multi-account + └── No provider configuration in child modules + + SECURITY + ├── No hardcoded secrets, keys, or passwords + ├── IAM follows least-privilege principle + ├── Encryption enabled for storage, databases, secrets + ├── Security groups are not overly permissive (no 0.0.0.0/0 ingress on sensitive ports) + └── Sensitive variables marked with sensitive = true + ``` + +3. **Generate report** + ```bash + python3 scripts/tf_module_analyzer.py ./terraform + ``` + +4. **Run security scan** + ```bash + python3 scripts/tf_security_scanner.py ./terraform + ``` + +### `/terraform:module` — Module Design + +1. **Identify module scope** + - Single responsibility: one module = one logical grouping + - Determine inputs (variables), outputs, and resource boundaries + - Decide: flat module (single directory) vs nested (calling child modules) + +2. 
**Apply module design checklist** + + ``` + STRUCTURE + ├── main.tf — Primary resources + ├── variables.tf — All input variables with descriptions and types + ├── outputs.tf — All outputs with descriptions + ├── versions.tf — terraform {} block with required_providers + ├── locals.tf — Computed values and naming conventions + ├── data.tf — Data sources (if any) + └── README.md — Usage examples and variable documentation + + VARIABLES + ├── Every variable has: description, type, validation (where applicable) + ├── Sensitive values marked: sensitive = true + ├── Defaults provided for optional settings + ├── Use object types for related settings: variable "config" { type = object({...}) } + └── Validate with: validation { condition = ... } + + OUTPUTS + ├── Output IDs, ARNs, endpoints — things consumers need + ├── Include description on every output + ├── Mark sensitive outputs: sensitive = true + └── Don't output entire resources — only specific attributes + + COMPOSITION + ├── Root module calls child modules + ├── Child modules never call other child modules + ├── Pass values explicitly — no hidden data source lookups in child modules + ├── Provider configuration only in root module + └── Use module "name" { source = "./modules/name" } + ``` + +3. **Generate module scaffold** + - Output file structure with boilerplate + - Include variable validation blocks + - Add lifecycle rules where appropriate + +### `/terraform:security` — Security Audit + +1. 
**Code-level audit** + + | Check | Severity | Fix | + |-------|----------|-----| + | Hardcoded secrets in `.tf` files | Critical | Use variables with sensitive = true or vault | + | IAM policy with `*` actions | Critical | Scope to specific actions and resources | + | Security group with 0.0.0.0/0 on port 22/3389 | Critical | Restrict to known CIDR blocks or use SSM/bastion | + | S3 bucket without encryption | High | Add `aws_s3_bucket_server_side_encryption_configuration` resource | + | S3 bucket with public access | High | Add `aws_s3_bucket_public_access_block` | + | RDS without encryption | High | Set `storage_encrypted = true` | + | RDS publicly accessible | High | Set `publicly_accessible = false` | + | CloudTrail not enabled | Medium | Add `aws_cloudtrail` resource | + | Missing `prevent_destroy` on stateful resources | Medium | Add `lifecycle { prevent_destroy = true }` | + | Variables without `sensitive = true` for secrets | Medium | Add `sensitive = true` to secret variables | + +2. **State security audit** + + | Check | Severity | Fix | + |-------|----------|-----| + | Local state file | Critical | Migrate to remote backend with encryption | + | Remote state without encryption | High | Enable encryption on backend (SSE-S3, KMS) | + | No state locking | High | Enable DynamoDB for S3, native for TF Cloud | + | State accessible to all team members | Medium | Restrict via IAM policies or TF Cloud teams | + +3. **Generate security report** + ```bash + python3 scripts/tf_security_scanner.py ./terraform + python3 scripts/tf_security_scanner.py ./terraform --output json + ``` + +--- + +## Tooling + +### `scripts/tf_module_analyzer.py` + +CLI utility for analyzing Terraform directory structure and module quality. 
+ +**Features:** +- Resource and data source counting +- Variable and output analysis (missing descriptions, types, validation) +- Naming convention checks +- Module composition detection +- File structure validation +- JSON and text output + +**Usage:** +```bash +# Analyze a Terraform directory +python3 scripts/tf_module_analyzer.py ./terraform + +# JSON output +python3 scripts/tf_module_analyzer.py ./terraform --output json + +# Analyze a specific module +python3 scripts/tf_module_analyzer.py ./modules/vpc +``` + +### `scripts/tf_security_scanner.py` + +CLI utility for scanning `.tf` files for common security issues. + +**Features:** +- Hardcoded secret detection (AWS keys, passwords, tokens) +- Overly permissive IAM policy detection +- Open security group detection (0.0.0.0/0 on sensitive ports) +- Missing encryption checks (S3, RDS, EBS) +- Public access detection (S3, RDS, EC2) +- Sensitive variable audit +- JSON and text output + +**Usage:** +```bash +# Scan a Terraform directory +python3 scripts/tf_security_scanner.py ./terraform + +# JSON output +python3 scripts/tf_security_scanner.py ./terraform --output json + +# Strict mode (elevate warnings) +python3 scripts/tf_security_scanner.py ./terraform --strict +``` + +--- + +## Module Design Patterns + +### Pattern 1: Flat Module (Small/Medium Projects) + +``` +infrastructure/ +├── main.tf # All resources +├── variables.tf # All inputs +├── outputs.tf # All outputs +├── versions.tf # Provider requirements +├── terraform.tfvars # Environment values (not committed) +└── backend.tf # Remote state configuration +``` + +Best for: Single application, < 20 resources, one team owns everything. + +### Pattern 2: Nested Modules (Medium/Large Projects) + +``` +infrastructure/ +├── environments/ +│ ├── dev/ +│ │ ├── main.tf # Calls modules with dev params +│ │ ├── backend.tf # Dev state backend +│ │ └── terraform.tfvars +│ ├── staging/ +│ │ └── ... +│ └── prod/ +│ └── ... 
+├── modules/ +│ ├── networking/ +│ │ ├── main.tf +│ │ ├── variables.tf +│ │ └── outputs.tf +│ ├── compute/ +│ │ └── ... +│ └── database/ +│ └── ... +└── versions.tf +``` + +Best for: Multiple environments, shared infrastructure patterns, team collaboration. + +### Pattern 3: Mono-Repo with Terragrunt + +``` +infrastructure/ +├── terragrunt.hcl # Root config +├── modules/ # Reusable modules +│ ├── vpc/ +│ ├── eks/ +│ └── rds/ +├── dev/ +│ ├── terragrunt.hcl # Dev overrides +│ ├── vpc/ +│ │ └── terragrunt.hcl # Module invocation +│ └── eks/ +│ └── terragrunt.hcl +└── prod/ + ├── terragrunt.hcl + └── ... +``` + +Best for: Large-scale, many environments, DRY configuration, team-level isolation. + +--- + +## Provider Configuration Patterns + +### Version Pinning +```hcl +terraform { + required_version = ">= 1.5.0" + + required_providers { + aws = { + source = "hashicorp/aws" + version = "~> 5.0" # Allow 5.x, block 6.0 + } + random = { + source = "hashicorp/random" + version = "~> 3.5" + } + } +} +``` + +### Multi-Region with Aliases +```hcl +provider "aws" { + region = "us-east-1" +} + +provider "aws" { + alias = "west" + region = "us-west-2" +} + +resource "aws_s3_bucket" "primary" { + bucket = "my-app-primary" +} + +resource "aws_s3_bucket" "replica" { + provider = aws.west + bucket = "my-app-replica" +} +``` + +### Multi-Account with Assume Role +```hcl +provider "aws" { + alias = "production" + region = "us-east-1" + + assume_role { + role_arn = "arn:aws:iam::PROD_ACCOUNT_ID:role/TerraformRole" + } +} +``` + +--- + +## State Management Decision Tree + +``` +Single developer, small project? +├── Yes → Local state (but migrate to remote ASAP) +└── No + ├── Using Terraform Cloud/Enterprise? + │ └── Yes → TF Cloud native backend (built-in locking, encryption, RBAC) + └── No + ├── AWS? + │ └── S3 + DynamoDB (encryption, locking, versioning) + ├── GCP? + │ └── GCS bucket (native locking, encryption) + ├── Azure? 
+ │ └── Azure Blob Storage (native locking, encryption) + └── Other? + └── Consul or PostgreSQL backend + +Environment isolation strategy: +├── Separate state files per environment (recommended) +│ ├── Option A: Separate directories (dev/, staging/, prod/) +│ └── Option B: Terraform workspaces (simpler but less isolation) +└── Single state file for all environments (never do this) +``` + +--- + +## CI/CD Integration Patterns + +### GitHub Actions Plan/Apply + +```yaml +# .github/workflows/terraform.yml +name: Terraform +on: + pull_request: + paths: ['terraform/**'] + push: + branches: [main] + paths: ['terraform/**'] + +jobs: + plan: + runs-on: ubuntu-latest + if: github.event_name == 'pull_request' + steps: + - uses: actions/checkout@v4 + - uses: hashicorp/setup-terraform@v3 + - run: terraform init + - run: terraform validate + - run: terraform plan -out=tfplan + - run: terraform show -json tfplan > plan.json + # Post plan as PR comment + + apply: + runs-on: ubuntu-latest + if: github.ref == 'refs/heads/main' && github.event_name == 'push' + environment: production + steps: + - uses: actions/checkout@v4 + - uses: hashicorp/setup-terraform@v3 + - run: terraform init + - run: terraform apply -auto-approve +``` + +### Drift Detection + +```yaml +# Run on schedule to detect drift +name: Drift Detection +on: + schedule: + - cron: '0 6 * * 1-5' # Weekdays at 6 AM + +jobs: + detect: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: hashicorp/setup-terraform@v3 + - run: terraform init + - run: | + terraform plan -detailed-exitcode -out=drift.tfplan 2>&1 | tee drift.log + EXIT_CODE=${PIPESTATUS[0]} # exit code of terraform plan, not of tee + if [ $EXIT_CODE -eq 2 ]; then + echo "DRIFT DETECTED — review drift.log" + # Send alert (Slack, PagerDuty, etc.) + fi +``` + +--- + +## Proactive Triggers + +Flag these without being asked: + +- **No remote backend configured** → Migrate to S3/GCS/Azure Blob with locking and encryption. 
+- **Provider without version constraint** → Add `version = "~> X.0"` to prevent breaking upgrades. +- **Hardcoded secrets in .tf files** → Use variables with `sensitive = true`, or integrate Vault/SSM. +- **IAM policy with `"Action": "*"`** → Scope to specific actions. No wildcard actions in production. +- **Security group open to 0.0.0.0/0 on SSH/RDP** → Restrict to bastion CIDR or use SSM Session Manager. +- **No state locking** → Enable DynamoDB table for S3 backend, or use TF Cloud. +- **Resources without tags** → Add default_tags in provider block. Tags are mandatory for cost tracking. +- **Missing `prevent_destroy` on databases/storage** → Add lifecycle block to prevent accidental deletion. + +--- + +## Installation + +### One-liner (any tool) +```bash +git clone https://github.com/alirezarezvani/claude-skills.git +cp -r claude-skills/engineering/terraform-patterns ~/.claude/skills/ +``` + +### Multi-tool install +```bash +./scripts/convert.sh --skill terraform-patterns --tool codex|gemini|cursor|windsurf|openclaw +``` + +### OpenClaw +```bash +clawhub install terraform-patterns +``` + +--- + +## Related Skills + +- **senior-devops** — Broader DevOps scope (CI/CD, monitoring, containerization). Complementary — use terraform-patterns for IaC-specific work, senior-devops for pipeline and infrastructure operations. +- **aws-solution-architect** — AWS architecture design. Complementary — terraform-patterns implements the infrastructure, aws-solution-architect designs it. +- **senior-security** — Application security. Complementary — terraform-patterns covers infrastructure security posture, senior-security covers application-level threats. +- **ci-cd-pipeline-builder** — Pipeline construction. Complementary — terraform-patterns defines infrastructure, ci-cd-pipeline-builder automates deployment. 
diff --git a/engineering/terraform-patterns/references/module-patterns.md b/engineering/terraform-patterns/references/module-patterns.md new file mode 100644 index 0000000..56a7ba1 --- /dev/null +++ b/engineering/terraform-patterns/references/module-patterns.md @@ -0,0 +1,409 @@ +# Terraform Module Design Patterns Reference + +## Pattern 1: Flat Module (Single Directory) + +Best for: Small projects, < 20 resources, single team ownership. + +``` +project/ +├── main.tf +├── variables.tf +├── outputs.tf +├── versions.tf +├── locals.tf +├── backend.tf +└── terraform.tfvars +``` + +### Example: Simple VPC + EC2 + +```hcl +# versions.tf +terraform { + required_version = ">= 1.5.0" + + required_providers { + aws = { + source = "hashicorp/aws" + version = "~> 5.0" + } + } +} + +# locals.tf +locals { + name_prefix = "${var.project}-${var.environment}" + common_tags = { + Project = var.project + Environment = var.environment + ManagedBy = "terraform" + } +} + +# main.tf +resource "aws_vpc" "main" { + cidr_block = var.vpc_cidr + enable_dns_hostnames = true + enable_dns_support = true + + tags = merge(local.common_tags, { + Name = "${local.name_prefix}-vpc" + }) +} + +resource "aws_subnet" "public" { + count = length(var.public_subnet_cidrs) + vpc_id = aws_vpc.main.id + cidr_block = var.public_subnet_cidrs[count.index] + availability_zone = var.availability_zones[count.index] + + tags = merge(local.common_tags, { + Name = "${local.name_prefix}-public-${count.index + 1}" + Tier = "public" + }) +} + +# variables.tf +variable "project" { + description = "Project name used for resource naming" + type = string +} + +variable "environment" { + description = "Deployment environment" + type = string + validation { + condition = contains(["dev", "staging", "prod"], var.environment) + error_message = "Environment must be dev, staging, or prod." 
+ } +} + +variable "vpc_cidr" { + description = "CIDR block for the VPC" + type = string + default = "10.0.0.0/16" + validation { + condition = can(cidrhost(var.vpc_cidr, 0)) + error_message = "Must be a valid CIDR block." + } +} + +variable "public_subnet_cidrs" { + description = "CIDR blocks for public subnets" + type = list(string) + default = ["10.0.1.0/24", "10.0.2.0/24"] +} + +variable "availability_zones" { + description = "AZs for subnet placement" + type = list(string) + default = ["us-east-1a", "us-east-1b"] +} + +# outputs.tf +output "vpc_id" { + description = "ID of the created VPC" + value = aws_vpc.main.id +} + +output "public_subnet_ids" { + description = "IDs of public subnets" + value = aws_subnet.public[*].id +} +``` + +--- + +## Pattern 2: Nested Modules (Composition) + +Best for: Multiple environments, shared patterns, team collaboration. + +``` +infrastructure/ +├── environments/ +│ ├── dev/ +│ │ ├── main.tf +│ │ ├── backend.tf +│ │ └── terraform.tfvars +│ ├── staging/ +│ │ └── ... +│ └── prod/ +│ └── ... +└── modules/ + ├── networking/ + │ ├── main.tf + │ ├── variables.tf + │ └── outputs.tf + ├── compute/ + │ └── ... + └── database/ + └── ... 
+``` + +### Root Module (environments/dev/main.tf) + +```hcl +module "networking" { + source = "../../modules/networking" + + project = var.project + environment = "dev" + vpc_cidr = "10.0.0.0/16" + public_subnet_cidrs = ["10.0.1.0/24", "10.0.2.0/24"] + private_subnet_cidrs = ["10.0.10.0/24", "10.0.11.0/24"] +} + +module "compute" { + source = "../../modules/compute" + + project = var.project + environment = "dev" + vpc_id = module.networking.vpc_id + subnet_ids = module.networking.private_subnet_ids + instance_type = "t3.micro" + instance_count = 1 +} + +module "database" { + source = "../../modules/database" + + project = var.project + environment = "dev" + vpc_id = module.networking.vpc_id + subnet_ids = module.networking.private_subnet_ids + instance_class = "db.t3.micro" + allocated_storage = 20 + db_password = var.db_password +} +``` + +### Key Rules +- Child modules never call other child modules +- Pass values explicitly — no hidden data source lookups in children +- Provider configuration only in root module +- Each module has its own variables.tf, outputs.tf, main.tf + +--- + +## Pattern 3: Registry Module Pattern + +Best for: Reusable modules shared across teams or organizations. + +``` +terraform-aws-vpc/ +├── main.tf +├── variables.tf +├── outputs.tf +├── versions.tf +├── README.md +├── examples/ +│ ├── simple/ +│ │ └── main.tf +│ └── complete/ +│ └── main.tf +└── modules/ + ├── subnet/ + │ ├── main.tf + │ ├── variables.tf + │ └── outputs.tf + └── nat-gateway/ + └── ... 
+``` + +### Publishing Conventions + +```hcl +# Consumer usage +module "vpc" { + source = "terraform-aws-modules/vpc/aws" + version = "~> 5.0" + + name = "my-vpc" + cidr = "10.0.0.0/16" + + azs = ["us-east-1a", "us-east-1b"] + private_subnets = ["10.0.1.0/24", "10.0.2.0/24"] + public_subnets = ["10.0.101.0/24", "10.0.102.0/24"] + + enable_nat_gateway = true + single_nat_gateway = true +} +``` + +### Registry Module Requirements +- Repository named `terraform-{provider}-{name}` (e.g. `terraform-aws-vpc`) +- README.md with usage examples +- Semantic versioning via git tags +- examples/ directory with working configurations +- No provider configuration in the module itself + +--- + +## Pattern 4: Mono-Repo with Workspaces + +Best for: Teams that prefer single-repo with workspace-based isolation. + +```hcl +# backend.tf +terraform { + backend "s3" { + bucket = "my-terraform-state" + key = "project/terraform.tfstate" + region = "us-east-1" + dynamodb_table = "terraform-locks" + encrypt = true + } +} + +# main.tf +locals { + env_config = { + dev = { + instance_type = "t3.micro" + instance_count = 1 + db_class = "db.t3.micro" + } + staging = { + instance_type = "t3.small" + instance_count = 2 + db_class = "db.t3.small" + } + prod = { + instance_type = "t3.large" + instance_count = 3 + db_class = "db.r5.large" + } + } + config = local.env_config[terraform.workspace] +} +``` + +### Usage +```bash +terraform workspace new dev +terraform workspace new staging +terraform workspace new prod + +terraform workspace select dev +terraform apply + +terraform workspace select prod +terraform apply +``` + +### Workspace Caveats +- All environments share the same backend — less isolation than separate directories +- A mistake in the code affects all environments +- Can't have different provider versions per workspace +- Recommended only for simple setups; prefer separate directories for production + +--- + +## Pattern 5: for_each vs count + +### Use `count` for identical resources +```hcl +resource "aws_subnet" "public" { + 
count = 3 + vpc_id = aws_vpc.main.id + cidr_block = cidrsubnet(var.vpc_cidr, 8, count.index) + availability_zone = data.aws_availability_zones.available.names[count.index] +} +``` + +### Use `for_each` for distinct resources +```hcl +variable "buckets" { + type = map(object({ + versioning = bool + lifecycle_days = number + })) + default = { + logs = { versioning = false, lifecycle_days = 30 } + backups = { versioning = true, lifecycle_days = 90 } + assets = { versioning = true, lifecycle_days = 0 } + } +} + +resource "aws_s3_bucket" "this" { + for_each = var.buckets + bucket = "${var.project}-${each.key}" +} + +resource "aws_s3_bucket_versioning" "this" { + for_each = { for k, v in var.buckets : k => v if v.versioning } + bucket = aws_s3_bucket.this[each.key].id + + versioning_configuration { + status = "Enabled" + } +} +``` + +### Why `for_each` > `count` +- `count` uses index — removing item 0 shifts all others, causing destroy/recreate +- `for_each` uses keys — removing a key only affects that resource +- Use `count` only for identical resources where order doesn't matter + +--- + +## Variable Design Patterns + +### Object Variables for Related Settings +```hcl +variable "database" { + description = "Database configuration" + type = object({ + engine = string + instance_class = string + storage_gb = number + multi_az = bool + backup_days = number + }) + default = { + engine = "postgres" + instance_class = "db.t3.micro" + storage_gb = 20 + multi_az = false + backup_days = 7 + } +} +``` + +### Validation Blocks +```hcl +variable "instance_type" { + description = "EC2 instance type" + type = string + + validation { + condition = can(regex("^t[23]\\.", var.instance_type)) + error_message = "Only t2 or t3 instance types are allowed." + } +} + +variable "cidr_block" { + description = "VPC CIDR block" + type = string + + validation { + condition = can(cidrhost(var.cidr_block, 0)) + error_message = "Must be a valid IPv4 CIDR block." 
+ } +} +``` + +--- + +## Anti-Patterns to Avoid + +| Anti-Pattern | Problem | Solution | +|-------------|---------|----------| +| God module (100+ resources) | Impossible to reason about, slow plan/apply | Split into focused child modules | +| Circular module dependencies | Terraform can't resolve dependency graph | Flatten or restructure module boundaries | +| Data sources in child modules | Hidden dependencies, hard to test | Pass values as variables from root module | +| Provider config in child modules | Can't reuse module across accounts/regions | Configure providers in root only | +| Hardcoded values | Not reusable across environments | Use variables with defaults and validation | +| No outputs | Consumer modules can't reference resources | Output IDs, ARNs, endpoints | +| No variable descriptions | Users don't know what to provide | Every variable gets a description | +| `terraform.tfvars` committed | Secrets leak to version control | Use `.gitignore`, env vars, or Vault | diff --git a/engineering/terraform-patterns/references/state-management.md b/engineering/terraform-patterns/references/state-management.md new file mode 100644 index 0000000..effaa4f --- /dev/null +++ b/engineering/terraform-patterns/references/state-management.md @@ -0,0 +1,472 @@ +# Terraform State Management Reference + +## Backend Configuration Patterns + +### AWS: S3 + DynamoDB (Recommended) + +```hcl +terraform { + backend "s3" { + bucket = "mycompany-terraform-state" + key = "project/env/terraform.tfstate" + region = "us-east-1" + encrypt = true + dynamodb_table = "terraform-locks" + # Optional: KMS key for encryption + # kms_key_id = "arn:aws:kms:us-east-1:ACCOUNT:key/KEY_ID" + } +} +``` + +**Prerequisites:** +```hcl +# Bootstrap these resources manually or with a separate Terraform config +resource "aws_s3_bucket" "state" { + bucket = "mycompany-terraform-state" + + lifecycle { + prevent_destroy = true + } +} + +resource "aws_s3_bucket_versioning" "state" { + bucket = 
aws_s3_bucket.state.id + versioning_configuration { + status = "Enabled" + } +} + +resource "aws_s3_bucket_server_side_encryption_configuration" "state" { + bucket = aws_s3_bucket.state.id + rule { + apply_server_side_encryption_by_default { + sse_algorithm = "aws:kms" + } + } +} + +resource "aws_s3_bucket_public_access_block" "state" { + bucket = aws_s3_bucket.state.id + block_public_acls = true + block_public_policy = true + ignore_public_acls = true + restrict_public_buckets = true +} + +resource "aws_dynamodb_table" "locks" { + name = "terraform-locks" + billing_mode = "PAY_PER_REQUEST" + hash_key = "LockID" + + attribute { + name = "LockID" + type = "S" + } +} +``` + +--- + +### GCP: Google Cloud Storage + +```hcl +terraform { + backend "gcs" { + bucket = "mycompany-terraform-state" + prefix = "project/env" + } +} +``` + +**Key features:** +- Native locking (no separate lock table needed) +- Object versioning for state history +- IAM-based access control +- Encryption at rest by default + +--- + +### Azure: Blob Storage + +```hcl +terraform { + backend "azurerm" { + resource_group_name = "terraform-state-rg" + storage_account_name = "mycompanytfstate" + container_name = "tfstate" + key = "project/env/terraform.tfstate" + } +} +``` + +**Key features:** +- Native blob locking +- Encryption at rest with Microsoft-managed or customer-managed keys +- RBAC-based access control + +--- + +### Terraform Cloud / Enterprise + +```hcl +terraform { + cloud { + organization = "mycompany" + workspaces { + name = "project-dev" + } + } +} +``` + +**Key features:** +- Built-in state locking, encryption, and versioning +- RBAC and team-based access control +- Remote execution (plan/apply run in TF Cloud) +- Sentinel policy-as-code integration +- Cost estimation on plans + +--- + +## Environment Isolation Strategies + +### Strategy 1: Separate Directories (Recommended) + +``` +infrastructure/ +├── environments/ +│ ├── dev/ +│ │ ├── main.tf +│ │ ├── backend.tf # key = 
"project/dev/terraform.tfstate" +│ │ └── terraform.tfvars +│ ├── staging/ +│ │ ├── main.tf +│ │ ├── backend.tf # key = "project/staging/terraform.tfstate" +│ │ └── terraform.tfvars +│ └── prod/ +│ ├── main.tf +│ ├── backend.tf # key = "project/prod/terraform.tfstate" +│ └── terraform.tfvars +└── modules/ + └── ... +``` + +**Pros:** +- Complete isolation — a mistake in dev can't affect prod +- Different provider versions per environment +- Different module versions per environment (pin prod, iterate in dev) +- Clear audit trail — who changed what, where + +**Cons:** +- Some duplication across environment directories +- Must update modules in each environment separately + +### Strategy 2: Terraform Workspaces + +```hcl +# Single directory, multiple workspaces +terraform { + backend "s3" { + bucket = "mycompany-terraform-state" + key = "project/terraform.tfstate" + region = "us-east-1" + dynamodb_table = "terraform-locks" + encrypt = true + } +} + +# State files stored at: +# env:/dev/project/terraform.tfstate +# env:/staging/project/terraform.tfstate +# env:/prod/project/terraform.tfstate +``` + +```bash +terraform workspace new dev +terraform workspace select dev +terraform plan -var-file="env/dev.tfvars" +``` + +**Pros:** +- Less duplication — single set of .tf files +- Quick to switch between environments +- Built-in workspace support in backends + +**Cons:** +- Shared code means a bug affects all environments simultaneously +- Can't have different provider versions per workspace +- Easy to accidentally apply to wrong workspace +- Less isolation than separate directories + +### Strategy 3: Terragrunt (DRY Configuration) + +``` +infrastructure/ +├── terragrunt.hcl # Root — defines remote state pattern +├── modules/ +│ └── vpc/ +│ ├── main.tf +│ ├── variables.tf +│ └── outputs.tf +├── dev/ +│ ├── terragrunt.hcl # env = "dev" +│ └── vpc/ +│ └── terragrunt.hcl # inputs for dev VPC +├── staging/ +│ └── ... +└── prod/ + └── ... 
+``` + +```hcl +# Root terragrunt.hcl +remote_state { + backend = "s3" + generate = { + path = "backend.tf" + if_exists = "overwrite_terragrunt" + } + config = { + bucket = "mycompany-terraform-state" + key = "${path_relative_to_include()}/terraform.tfstate" + region = "us-east-1" + encrypt = true + dynamodb_table = "terraform-locks" + } +} + +# dev/vpc/terragrunt.hcl +terraform { + source = "../../modules/vpc" +} + +inputs = { + environment = "dev" + vpc_cidr = "10.0.0.0/16" +} +``` + +**Pros:** +- Maximum DRY — define module once, parameterize per environment +- Automatic state key generation from directory structure +- Dependency management between modules (`dependency` blocks) +- `run-all` for applying multiple modules at once + +**Cons:** +- Additional tool dependency (Terragrunt) +- Learning curve +- Debugging can be harder (generated files) + +--- + +## State Migration Patterns + +### Local to Remote (S3) + +```bash +# 1. Add backend configuration to backend.tf +# 2. Run init with migration flag +terraform init -migrate-state + +# Terraform will prompt: +# "Do you want to copy existing state to the new backend?" +# Answer: yes +``` + +### Between Remote Backends + +```bash +# 1. Pull current state +terraform state pull > terraform.tfstate.backup + +# 2. Update backend configuration in backend.tf + +# 3. Reinitialize with migration +terraform init -migrate-state + +# 4. 
Verify +terraform plan # Should show no changes +``` + +### State Import (Existing Resources) + +```bash +# Import a single resource +terraform import aws_instance.web i-1234567890abcdef0 + +# Import with for_each key +terraform import 'aws_subnet.public["us-east-1a"]' subnet-0123456789abcdef0 + +# Bulk import (Terraform 1.5+ import blocks) +import { + to = aws_instance.web + id = "i-1234567890abcdef0" +} +``` + +### State Move (Refactoring) + +```bash +# Rename a resource (avoids destroy/recreate) +terraform state mv aws_instance.old_name aws_instance.new_name + +# Move into a module +terraform state mv aws_instance.web module.compute.aws_instance.web + +# Move between state files +terraform state mv -state-out=other.tfstate aws_instance.web aws_instance.web +``` + +--- + +## State Locking + +### Why Locking Matters +Without locking, two concurrent `terraform apply` runs can corrupt state. The second apply reads stale state and may create duplicate resources or lose track of existing ones. + +### Lock Behavior by Backend + +| Backend | Lock Mechanism | Auto-Lock | Force Unlock | +|---------|---------------|-----------|--------------| +| S3 | DynamoDB table | Yes (if table configured) | `terraform force-unlock LOCK_ID` | +| GCS | Native blob locking | Yes | `terraform force-unlock LOCK_ID` | +| Azure Blob | Native blob lease | Yes | `terraform force-unlock LOCK_ID` | +| TF Cloud | Built-in | Always | Via UI or API | +| Consul | Key-value lock | Yes | `terraform force-unlock LOCK_ID` | +| Local | `.terraform.tfstate.lock.info` (not `.terraform.lock.hcl`, which pins provider versions) | Yes (single user) | Delete stale lock file | + +### Force Unlock (Emergency Only) + +```bash +# Only use when you're certain no other process is running +terraform force-unlock LOCK_ID + +# The LOCK_ID is shown in the error message when lock fails: +# Error: Error locking state: Error acquiring the state lock +# Lock Info: +# ID: 12345678-abcd-1234-abcd-1234567890ab +``` + +--- + +## State Security Best Practices + +### 1. 
Encrypt at Rest +```hcl +# S3 — server-side encryption +backend "s3" { + encrypt = true + kms_key_id = "arn:aws:kms:us-east-1:ACCOUNT:key/KEY_ID" +} +``` + +### 2. Restrict Access +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": [ + "s3:GetObject", + "s3:PutObject", + "s3:DeleteObject" + ], + "Resource": "arn:aws:s3:::mycompany-terraform-state/project/*", + "Condition": { + "StringEquals": { + "aws:PrincipalTag/Team": "platform" + } + } + }, + { + "Effect": "Allow", + "Action": [ + "dynamodb:GetItem", + "dynamodb:PutItem", + "dynamodb:DeleteItem" + ], + "Resource": "arn:aws:dynamodb:us-east-1:ACCOUNT:table/terraform-locks" + } + ] +} +``` + +### 3. Enable Versioning (State History) +```hcl +resource "aws_s3_bucket_versioning" "state" { + bucket = aws_s3_bucket.state.id + versioning_configuration { + status = "Enabled" + } +} +``` + +Versioning lets you recover from state corruption by restoring a previous version. + +### 4. Audit Access +- Enable S3 access logging or CloudTrail data events +- Monitor for unexpected state reads (potential secret extraction) +- State files contain sensitive values — treat them like credentials + +### 5. Sensitive Values in State +Terraform stores all resource attributes in state, including passwords, private keys, and tokens. This is unavoidable. 
Mitigate by: +- Encrypting state at rest (KMS) +- Restricting state file access (IAM) +- Using `sensitive = true` on variables and outputs (prevents display, not storage) +- Rotating secrets regularly (state contains the value at apply time) + +--- + +## Drift Detection and Reconciliation + +### Detect Drift +```bash +# Plan with detailed exit code +terraform plan -detailed-exitcode +# Exit 0 = no changes +# Exit 1 = error +# Exit 2 = changes detected (drift) +``` + +### Common Drift Sources +| Source | Example | Prevention | +|--------|---------|------------| +| Console changes | Someone edits SG rules in AWS Console | SCPs to restrict console access, or accept and reconcile | +| Auto-scaling | ASG launches instances not in state | Don't manage individual instances; manage ASG | +| External tools | Ansible modifies EC2 tags | Agree on ownership boundaries | +| Dependent resource changes | AMI deregistered | Use data sources to detect, lifecycle ignore_changes | + +### Reconciliation Options +```hcl +# Option 1: Apply to restore desired state +terraform apply + +# Option 2: Refresh state to match reality +terraform apply -refresh-only + +# Option 3: Ignore specific attribute drift +resource "aws_instance" "web" { + lifecycle { + ignore_changes = [tags["LastModifiedBy"], ami] + } +} + +# Option 4: Import the manually-created resource +terraform import aws_security_group_rule.new sg-12345_ingress_tcp_443_443_0.0.0.0/0 +``` + +--- + +## Troubleshooting Checklist + +| Symptom | Likely Cause | Fix | +|---------|-------------|-----| +| "Error acquiring state lock" | Concurrent run or crashed process | Wait for other run to finish, or `force-unlock` | +| "Backend configuration changed" | Backend config modified | Run `terraform init -reconfigure` or `-migrate-state` | +| "Resource already exists" | Resource created outside Terraform | `terraform import` the resource | +| "No matching resource found" | Resource deleted outside Terraform | `terraform state rm` the resource 
| +| State file growing very large | Too many resources in one state | Split into smaller state files using modules | +| Slow plan/apply | Large state file, many resources | Split state, use `-target` for urgent changes | +| "Provider produced inconsistent result" | Provider bug or API race condition | Retry, or pin provider version | +| Workspace confusion | Applied to wrong workspace | Always check `terraform workspace show` before apply | diff --git a/engineering/terraform-patterns/scripts/tf_module_analyzer.py b/engineering/terraform-patterns/scripts/tf_module_analyzer.py new file mode 100644 index 0000000..cb74581 --- /dev/null +++ b/engineering/terraform-patterns/scripts/tf_module_analyzer.py @@ -0,0 +1,461 @@ +#!/usr/bin/env python3 +""" +terraform-patterns: Terraform Module Analyzer + +Analyze a Terraform directory structure for module quality, resource counts, +naming conventions, and structural best practices. Reports variable/output +coverage, file organization, and actionable recommendations. 
+ +Usage: + python scripts/tf_module_analyzer.py ./terraform + python scripts/tf_module_analyzer.py ./terraform --output json + python scripts/tf_module_analyzer.py ./modules/vpc +""" + +import argparse +import json +import os +import re +import sys +from pathlib import Path + + +# --- Demo Terraform Files --- + +DEMO_FILES = { + "main.tf": """ +resource "aws_instance" "web_server" { + ami = var.ami_id + instance_type = var.instance_type + + tags = { + Name = "web-server" + } +} + +resource "aws_s3_bucket" "data" { + bucket = "my-data-bucket-12345" +} + +resource "aws_security_group" "web" { + name = "web-sg" + + ingress { + from_port = 80 + to_port = 80 + protocol = "tcp" + cidr_blocks = ["0.0.0.0/0"] + } + + ingress { + from_port = 22 + to_port = 22 + protocol = "tcp" + cidr_blocks = ["0.0.0.0/0"] + } +} + +data "aws_ami" "ubuntu" { + most_recent = true + owners = ["099720109477"] +} + +module "vpc" { + source = "./modules/vpc" + cidr = var.vpc_cidr +} +""", + "variables.tf": """ +variable "ami_id" { + type = string +} + +variable "instance_type" { + default = "t3.micro" +} + +variable "vpc_cidr" { + description = "CIDR block for the VPC" + type = string + default = "10.0.0.0/16" +} + +variable "environment" { + description = "Deployment environment" + type = string + validation { + condition = contains(["dev", "staging", "prod"], var.environment) + error_message = "Environment must be dev, staging, or prod." 
def parse_variables(content):
    """Extract variable declarations with metadata.

    This is a regex heuristic, not an HCL parser. A block is recognised when
    ``variable "<name>" {`` starts at column 0 and is closed either on the
    same line (``variable "x" {}``) or by a ``}`` at column 0 on a later line.

    Args:
        content: HCL source text (one file or several concatenated).

    Returns:
        A list of dicts in source order, one per matched block, with the keys
        ``name``, ``has_description``, ``has_type``, ``has_default``,
        ``has_validation`` and ``is_sensitive``.
    """
    # Two alternatives for the body:
    #   group 2 - single-line block: everything up to a "}" on the same line.
    #   group 3 - multi-line block: everything up to the first "\n}".
    # Without the single-line alternative, `variable "x" {}` would run on to
    # the *next* block's closing brace, hiding that block and attributing its
    # arguments to "x".
    block_re = re.compile(
        r'^variable\s+"([^"]+)"\s*\{(?:([^{}\n]*)\}|(.*?)\n\})',
        re.MULTILINE | re.DOTALL,
    )

    def has_argument(body, name):
        # Anchor to the start of a line so the argument name is not matched
        # inside a string value (e.g. default = "no description yet").
        return bool(re.search(rf'^\s*{name}\s*=', body, re.MULTILINE))

    variables = []
    for match in block_re.finditer(content):
        single_line, multi_line = match.group(2), match.group(3)
        body = single_line if single_line is not None else multi_line
        variables.append({
            "name": match.group(1),
            "has_description": has_argument(body, "description"),
            "has_type": has_argument(body, "type"),
            "has_default": has_argument(body, "default"),
            "has_validation": bool(
                re.search(r'^\s*validation\s*\{', body, re.MULTILINE)
            ),
            "is_sensitive": bool(
                re.search(r'^\s*sensitive\s*=\s*true\b', body, re.MULTILINE)
            ),
        })
    return variables
# Substrings that mark a variable name as secret material on their own.
_SECRET_NAME_MARKERS = (
    "password", "secret", "token", "credentials",
    "api_key", "private_key", "access_key",
)

# A name that merely contains "key" and ends with one of these refers to a key
# (its identifier, ARN, or name), not to the key material itself.
_KEY_REFERENCE_SUFFIXES = ("_id", "_arn", "_name", "_alias")


def _looks_like_secret(name):
    """Return True when a variable name suggests it carries secret material."""
    lowered = name.lower()
    if any(marker in lowered for marker in _SECRET_NAME_MARKERS):
        return True
    if "key" not in lowered:
        return False
    # Generic "key" hit: skip public keys and references such as key_name,
    # kms_key_id or kms_key_arn, which are routinely non-sensitive.
    if "public_key" in lowered:
        return False
    return not lowered.endswith(_KEY_REFERENCE_SUFFIXES)


def check_variables(variables):
    """Check variable quality.

    Args:
        variables: list of dicts with the keys ``name``, ``has_description``,
            ``has_type`` and ``is_sensitive`` (the shape produced by
            ``parse_variables``).

    Returns:
        A list of finding dicts (``severity``, ``message``). Per variable, in
        order: a medium finding for a missing description, a high finding for
        a missing type constraint, and a high finding when the name looks like
        a secret but the variable is not marked sensitive.
    """
    issues = []
    for v in variables:
        if not v["has_description"]:
            issues.append({
                "severity": "medium",
                "message": f"Variable '{v['name']}' missing description — consumers won't know what to provide",
            })
        if not v["has_type"]:
            issues.append({
                "severity": "high",
                "message": f"Variable '{v['name']}' missing type constraint — accepts any value",
            })
        if _looks_like_secret(v["name"]) and not v["is_sensitive"]:
            issues.append({
                "severity": "high",
                "message": f"Variable '{v['name']}' looks like a secret but is not marked sensitive = true",
            })
    return issues
data_sources)) + findings.extend(check_variables(variables)) + findings.extend(check_outputs(outputs)) + + # Check for backend configuration + has_backend = any( + re.search(r'\bbackend\s+"', content) + for content in tf_files.values() + ) + if not has_backend: + findings.append({ + "severity": "high", + "message": "No remote backend configured — state is stored locally", + }) + + # Check for terraform required_version + has_tf_version = any( + re.search(r'required_version\s*=', content) + for content in tf_files.values() + ) + if not has_tf_version: + findings.append({ + "severity": "medium", + "message": "No required_version constraint — any Terraform version can be used", + }) + + # Providers in child modules check + for filename, content in tf_files.items(): + if filename not in ("providers.tf", "versions.tf", "backend.tf"): + if re.search(r'^provider\s+"', content, re.MULTILINE): + findings.append({ + "severity": "medium", + "message": f"Provider configuration found in '{filename}' — keep providers in root module only", + }) + + # Sort findings + severity_order = {"critical": 0, "high": 1, "medium": 2, "low": 3} + findings.sort(key=lambda f: severity_order.get(f["severity"], 4)) + + # Unique providers + providers = sorted(set(r["provider"] for r in resources)) + + return { + "files": sorted(tf_files.keys()), + "file_count": len(tf_files), + "resources": resources, + "resource_count": len(resources), + "data_sources": data_sources, + "data_source_count": len(data_sources), + "variables": variables, + "variable_count": len(variables), + "outputs": outputs, + "output_count": len(outputs), + "modules": modules, + "module_count": len(modules), + "providers": providers, + "findings": findings, + } + + +def generate_report(analysis, output_format="text"): + """Generate analysis report.""" + findings = analysis["findings"] + + # Score + deductions = {"critical": 25, "high": 15, "medium": 5, "low": 2} + score = max(0, 100 - sum(deductions.get(f["severity"], 0) for f in 
findings)) + + counts = { + "critical": sum(1 for f in findings if f["severity"] == "critical"), + "high": sum(1 for f in findings if f["severity"] == "high"), + "medium": sum(1 for f in findings if f["severity"] == "medium"), + "low": sum(1 for f in findings if f["severity"] == "low"), + } + + result = { + "score": score, + "files": analysis["files"], + "resource_count": analysis["resource_count"], + "data_source_count": analysis["data_source_count"], + "variable_count": analysis["variable_count"], + "output_count": analysis["output_count"], + "module_count": analysis["module_count"], + "providers": analysis["providers"], + "findings": findings, + "finding_counts": counts, + } + + if output_format == "json": + print(json.dumps(result, indent=2)) + return result + + # Text output + print(f"\n{'=' * 60}") + print(f" Terraform Module Analysis Report") + print(f"{'=' * 60}") + print(f" Score: {score}/100") + print(f" Files: {', '.join(analysis['files'])}") + print(f" Providers: {', '.join(analysis['providers']) if analysis['providers'] else 'none detected'}") + print() + print(f" Resources: {analysis['resource_count']} | Data Sources: {analysis['data_source_count']}") + print(f" Variables: {analysis['variable_count']} | Outputs: {analysis['output_count']} | Modules: {analysis['module_count']}") + print() + print(f" Findings: {counts['critical']} critical | {counts['high']} high | {counts['medium']} medium | {counts['low']} low") + print(f"{'─' * 60}") + + for f in findings: + icon = {"critical": "!!!", "high": "!!", "medium": "!", "low": "~"}.get(f["severity"], "?") + print(f"\n {icon} {f['severity'].upper()}") + print(f" {f['message']}") + + if not findings: + print("\n No issues found. 
def main():
    """Command-line entry point for the module analyzer.

    Parses an optional ``directory`` argument and an ``--output/-o`` format
    (``text`` or ``json``). With a directory, its ``.tf`` files are loaded via
    ``find_tf_files``; without one, the built-in ``DEMO_FILES`` are analysed.
    Exits with status 1 when the path is not a directory or contains no
    ``.tf`` files.
    """
    parser = argparse.ArgumentParser(
        description="terraform-patterns: Terraform module analyzer"
    )
    parser.add_argument(
        "directory", nargs="?",
        help="Path to Terraform directory (omit for demo)",
    )
    parser.add_argument(
        "--output", "-o",
        choices=["text", "json"],
        default="text",
        help="Output format (default: text)",
    )
    args = parser.parse_args()

    if args.directory:
        dirpath = Path(args.directory)
        if not dirpath.is_dir():
            print(f"Error: Not a directory: {args.directory}", file=sys.stderr)
            sys.exit(1)
        tf_files = find_tf_files(str(dirpath))
        if not tf_files:
            print(f"Error: No .tf files found in {args.directory}", file=sys.stderr)
            sys.exit(1)
    else:
        # Diagnostics go to stderr so that `--output json` in demo mode still
        # writes nothing but the JSON document to stdout.
        print("No directory provided. Running demo analysis...\n", file=sys.stderr)
        tf_files = DEMO_FILES

    analysis = analyze_directory(tf_files)
    generate_report(analysis, args.output)
+ +Usage: + python scripts/tf_security_scanner.py ./terraform + python scripts/tf_security_scanner.py ./terraform --output json + python scripts/tf_security_scanner.py ./terraform --strict +""" + +import argparse +import json +import os +import re +import sys +from pathlib import Path + + +# --- Demo Terraform File --- + +DEMO_TF = """ +provider "aws" { + region = "us-east-1" + access_key = "AKIAIOSFODNN7EXAMPLE" + secret_key = "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY" +} + +variable "db_password" { + type = string + default = "supersecret123" +} + +resource "aws_instance" "web" { + ami = "ami-12345678" + instance_type = "t3.micro" + + tags = { + Name = "web-server" + } +} + +resource "aws_security_group" "web" { + name = "web-sg" + + ingress { + from_port = 22 + to_port = 22 + protocol = "tcp" + cidr_blocks = ["0.0.0.0/0"] + } + + ingress { + from_port = 0 + to_port = 65535 + protocol = "tcp" + cidr_blocks = ["0.0.0.0/0"] + } + + egress { + from_port = 0 + to_port = 0 + protocol = "-1" + cidr_blocks = ["0.0.0.0/0"] + } +} + +resource "aws_iam_policy" "admin" { + name = "admin-policy" + + policy = jsonencode({ + Version = "2012-10-17" + Statement = [ + { + Effect = "Allow" + Action = "*" + Resource = "*" + } + ] + }) +} + +resource "aws_s3_bucket" "data" { + bucket = "my-data-bucket" +} + +resource "aws_db_instance" "main" { + engine = "mysql" + instance_class = "db.t3.micro" + password = "hardcoded-password" + publicly_accessible = true + skip_final_snapshot = true +} +""" + +# --- Security Rules --- + +SECRET_PATTERNS = [ + { + "id": "SEC001", + "name": "aws_access_key", + "severity": "critical", + "pattern": r'(?:access_key|aws_access_key_id)\s*=\s*"(AKIA[A-Z0-9]{16})"', + "message": "AWS access key hardcoded in configuration", + "fix": "Use environment variables, AWS profiles, or IAM roles instead", + }, + { + "id": "SEC002", + "name": "aws_secret_key", + "severity": "critical", + "pattern": r'(?:secret_key|aws_secret_access_key)\s*=\s*"[A-Za-z0-9/+=]{40}"', + 
# IAM wildcard rules, applied by check_regex_rules with re.MULTILINE | re.IGNORECASE.
#
# Each pattern accepts the spellings Terraform policies commonly use:
#   * HCL inside jsonencode():        Action = "*"
#   * raw JSON / heredoc policies:    "Action": "*"
#   * list values:                    Action = ["s3:GetObject", "*"]
#   * aws_iam_policy_document blocks: actions = ["*"] / resources = ["*"]
# The (?<![A-Za-z_]) look-behind keeps NotAction / NotResource / not_actions /
# not_resources from being reported as plain wildcards.
IAM_PATTERNS = [
    {
        "id": "SEC010",
        "name": "iam_wildcard_action",
        "severity": "critical",
        "pattern": r'(?<![A-Za-z_])"?Actions?"?\s*[=:]\s*(?:"\*"|\[[^\]]*"\*")',
        "message": "IAM policy with wildcard Action = \"*\" — grants all permissions",
        "fix": "Scope Action to specific services and operations",
    },
    {
        "id": "SEC011",
        "name": "iam_wildcard_resource",
        "severity": "high",
        "pattern": r'(?<![A-Za-z_])"?Resources?"?\s*[=:]\s*(?:"\*"|\[[^\]]*"\*")',
        "message": "IAM policy with wildcard Resource = \"*\" — applies to all resources",
        "fix": "Scope Resource to specific ARN patterns",
    },
    {
        "id": "SEC012",
        "name": "iam_star_star",
        "severity": "critical",
        # [^}]* keeps the Action/Resource pair inside one statement object.
        "pattern": (
            r'(?<![A-Za-z_])"?Actions?"?\s*[=:]\s*(?:"\*"|\[[^\]]*"\*")'
            r'[^}]*'
            r'(?<![A-Za-z_])"?Resources?"?\s*[=:]\s*(?:"\*"|\[[^\]]*"\*")'
        ),
        "message": "IAM policy with Action=* AND Resource=* — effectively admin access",
        "fix": "Follow least-privilege: grant only the specific actions and resources needed",
    },
]
(port 22) from 0.0.0.0/0", + "fix": "Restrict to known CIDR blocks, or use SSM Session Manager instead", + }, + { + "id": "SEC021", + "name": "sg_rdp_open", + "severity": "critical", + "pattern": None, # Custom check + "message": "Security group allows RDP (port 3389) from 0.0.0.0/0", + "fix": "Restrict to known CIDR blocks, or use a bastion host", + }, + { + "id": "SEC022", + "name": "sg_all_ports", + "severity": "critical", + "pattern": None, # Custom check + "message": "Security group allows all ports (0-65535) from 0.0.0.0/0", + "fix": "Open only the specific ports your application needs", + }, +] + +ENCRYPTION_PATTERNS = [ + { + "id": "SEC030", + "name": "s3_no_encryption", + "severity": "high", + "pattern": None, # Custom check + "message": "S3 bucket without server-side encryption configuration", + "fix": "Add aws_s3_bucket_server_side_encryption_configuration resource", + }, + { + "id": "SEC031", + "name": "rds_no_encryption", + "severity": "high", + "pattern": None, # Custom check + "message": "RDS instance without storage encryption", + "fix": "Set storage_encrypted = true on aws_db_instance", + }, + { + "id": "SEC032", + "name": "ebs_no_encryption", + "severity": "medium", + "pattern": None, # Custom check + "message": "EBS volume without encryption", + "fix": "Set encrypted = true on aws_ebs_volume or enable account-level default encryption", + }, +] + +ACCESS_PATTERNS = [ + { + "id": "SEC040", + "name": "rds_public", + "severity": "high", + "pattern": r'publicly_accessible\s*=\s*true', + "message": "RDS instance is publicly accessible", + "fix": "Set publicly_accessible = false and access via VPC/bastion", + }, + { + "id": "SEC041", + "name": "s3_public_acl", + "severity": "high", + "pattern": r'acl\s*=\s*"public-read(?:-write)?"', + "message": "S3 bucket with public ACL", + "fix": "Remove public ACL and add aws_s3_bucket_public_access_block", + }, +] + + +def find_tf_files(directory): + """Find all .tf files in a directory (non-recursive).""" + 
def check_security_groups(content):
    """Custom check for open security groups.

    Scans the inline ``ingress`` blocks of every ``aws_security_group``
    resource and reports rules that are open to the world (``0.0.0.0/0`` or
    ``::/0``) and cover SSH (SEC020), RDP (SEC021) or the full port range
    (SEC022). Rule text is taken from ``NETWORK_PATTERNS``.

    An ingress block with ``protocol = "-1"`` (or ``"all"``) is treated as
    covering ports 0-65535: such rules are written with ports 0/0 and would
    otherwise never overlap port 22 or 3389.

    Args:
        content: HCL source text.

    Returns:
        A list of finding dicts (``id``, ``severity``, ``message``, ``fix``,
        ``line``). ``line`` names the security group so that findings from
        different groups stay distinct when callers de-duplicate on
        ``(id, line)``.
    """
    rules = {rule["id"]: rule for rule in NETWORK_PATTERNS}
    findings = []

    def report(rule_id, sg_name, detail):
        rule = rules[rule_id]
        findings.append({
            "id": rule["id"],
            "severity": rule["severity"],
            "message": rule["message"],
            "fix": rule["fix"],
            "line": f"aws_security_group.{sg_name}: {detail}",
        })

    sg_blocks = re.finditer(
        r'resource\s+"aws_security_group"\s+"([^"]+)"\s*\{(.*?)\n\}',
        content,
        re.DOTALL,
    )

    for sg_match in sg_blocks:
        sg_name, sg_body = sg_match.group(1), sg_match.group(2)

        for ingress in re.finditer(r'ingress\s*\{(.*?)\}', sg_body, re.DOTALL):
            block = ingress.group(1)
            open_cidrs = [c for c in ("0.0.0.0/0", "::/0") if c in block]
            if not open_cidrs:
                continue

            from_port_match = re.search(r'from_port\s*=\s*(\d+)', block)
            to_port_match = re.search(r'to_port\s*=\s*(\d+)', block)
            if not (from_port_match and to_port_match):
                continue

            from_port = int(from_port_match.group(1))
            to_port = int(to_port_match.group(1))

            # "-1"/"all" means every protocol and every port, whatever the
            # (required-to-be-zero) port fields say.
            protocol_match = re.search(r'protocol\s*=\s*"?([^"\s]+)"?', block)
            if protocol_match and protocol_match.group(1).lower() in ("-1", "all"):
                from_port, to_port = 0, 65535

            cidr_label = ", ".join(open_cidrs)

            # SSH open
            if from_port <= 22 <= to_port:
                report("SEC020", sg_name, f"ingress port 22, cidr {cidr_label}")

            # RDP open
            if from_port <= 3389 <= to_port:
                report("SEC021", sg_name, f"ingress port 3389, cidr {cidr_label}")

            # All ports open
            if from_port == 0 and to_port >= 65535:
                report("SEC022", sg_name, f"ingress ports 0-65535, cidr {cidr_label}")

    return findings
def check_sensitive_variables(content):
    """Check if variables that look like secrets are marked sensitive.

    For every ``variable`` block whose name contains a secret-like marker:

    * SEC050 (medium) when the block has no ``sensitive = true``.
    * SEC051 (critical) when the block has a non-empty quoted string default.

    Args:
        content: HCL source text.

    Returns:
        A list of finding dicts (``id``, ``severity``, ``message``, ``fix``,
        ``line``). The default value itself is never copied into a finding:
        scan reports tend to end up in CI logs, and echoing the secret there
        would leak it a second time.
    """
    secret_names = ("password", "secret", "token", "api_key", "private_key", "credentials")

    # Group 2: body of a single-line block (`variable "x" {}`); group 3: body
    # of a multi-line block closed by "}" at column 0. Without the single-line
    # alternative a one-line block would run on to the next block's closing
    # brace and hide that block from the scan.
    block_re = re.compile(
        r'variable\s+"([^"]+)"\s*\{(?:([^{}\n]*)\}|(.*?)\n\})',
        re.DOTALL,
    )

    findings = []
    for var_match in block_re.finditer(content):
        name = var_match.group(1)
        single_line, multi_line = var_match.group(2), var_match.group(3)
        body = single_line if single_line is not None else multi_line

        name_lower = name.lower()
        if not any(marker in name_lower for marker in secret_names):
            continue

        if not re.search(r'sensitive\s*=\s*true', body):
            findings.append({
                "id": "SEC050",
                "severity": "medium",
                "message": f"Variable '{name}' appears to be a secret but is not marked sensitive = true",
                "fix": "Add sensitive = true to prevent the value from appearing in logs and plan output",
                "line": f'variable "{name}"',
            })

        # Check for hardcoded default
        if re.search(r'default\s*=\s*"[^"]+"', body):
            findings.append({
                "id": "SEC051",
                "severity": "critical",
                "message": f"Variable '{name}' has a hardcoded default value for a secret",
                "fix": "Remove the default value — require it to be passed at runtime via tfvars or env",
                "line": f'variable "{name}" default = "<redacted>"',
            })

    return findings
findings.extend(check_regex_rules(content, ACCESS_PATTERNS)) + findings.extend(check_security_groups(content)) + findings.extend(check_encryption(content)) + findings.extend(check_sensitive_variables(content)) + + if strict: + for f in findings: + if f["severity"] == "medium": + f["severity"] = "high" + elif f["severity"] == "low": + f["severity"] = "medium" + + # Deduplicate by (id, line) + seen = set() + unique = [] + for f in findings: + key = (f["id"], f.get("line", "")) + if key not in seen: + seen.add(key) + unique.append(f) + findings = unique + + # Sort by severity + severity_order = {"critical": 0, "high": 1, "medium": 2, "low": 3} + findings.sort(key=lambda f: severity_order.get(f["severity"], 4)) + + return findings + + +def generate_report(content, output_format="text", strict=False): + """Generate security scan report.""" + findings = scan_content(content, strict) + + # Score + deductions = {"critical": 25, "high": 15, "medium": 5, "low": 2} + score = max(0, 100 - sum(deductions.get(f["severity"], 0) for f in findings)) + + counts = { + "critical": sum(1 for f in findings if f["severity"] == "critical"), + "high": sum(1 for f in findings if f["severity"] == "high"), + "medium": sum(1 for f in findings if f["severity"] == "medium"), + "low": sum(1 for f in findings if f["severity"] == "low"), + } + + result = { + "score": score, + "findings": findings, + "finding_counts": counts, + "total_findings": len(findings), + } + + if output_format == "json": + print(json.dumps(result, indent=2)) + return result + + # Text output + print(f"\n{'=' * 60}") + print(f" Terraform Security Scan Report") + print(f"{'=' * 60}") + print(f" Score: {score}/100") + print() + print(f" Findings: {counts['critical']} critical | {counts['high']} high | {counts['medium']} medium | {counts['low']} low") + print(f"{'─' * 60}") + + for f in findings: + icon = {"critical": "!!!", "high": "!!", "medium": "!", "low": "~"}.get(f["severity"], "?") + print(f"\n [{f['id']}] {icon} 
def main():
    """Command-line entry point for the security scanner.

    Parses an optional ``target`` (a directory or a single ``.tf`` file), an
    ``--output/-o`` format (``text`` or ``json``) and a ``--strict`` flag.
    A directory is loaded through ``find_tf_files`` and its files are joined
    with newlines; a ``.tf`` file is read as UTF-8; with no target the
    built-in ``DEMO_TF`` is scanned. Exits with status 1 when the target is
    neither a directory nor a ``.tf`` file, or the directory has no ``.tf``
    files.
    """
    parser = argparse.ArgumentParser(
        description="terraform-patterns: Terraform security scanner"
    )
    parser.add_argument(
        "target", nargs="?",
        help="Path to Terraform directory or .tf file (omit for demo)",
    )
    parser.add_argument(
        "--output", "-o",
        choices=["text", "json"],
        default="text",
        help="Output format (default: text)",
    )
    parser.add_argument(
        "--strict",
        action="store_true",
        help="Strict mode — elevate warnings to higher severity",
    )
    args = parser.parse_args()

    if args.target:
        target = Path(args.target)
        if target.is_dir():
            tf_files = find_tf_files(str(target))
            if not tf_files:
                print(f"Error: No .tf files found in {args.target}", file=sys.stderr)
                sys.exit(1)
            content = "\n".join(tf_files.values())
        elif target.is_file() and target.suffix == ".tf":
            content = target.read_text(encoding="utf-8")
        else:
            print(f"Error: {args.target} is not a directory or .tf file", file=sys.stderr)
            sys.exit(1)
    else:
        # Diagnostics go to stderr so that `--output json` in demo mode still
        # writes nothing but the JSON document to stdout.
        print("No target provided. Running demo scan...\n", file=sys.stderr)
        content = DEMO_TF

    generate_report(content, args.output, args.strict)