feat(skills): add terraform-patterns agent skill

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Leo
2026-03-15 23:29:01 +01:00
parent 0c31067556
commit dac49ee9f9
6 changed files with 2419 additions and 0 deletions

View File

@@ -0,0 +1,409 @@
# Terraform Module Design Patterns Reference
## Pattern 1: Flat Module (Single Directory)
Best for: Small projects, < 20 resources, single team ownership.
```
project/
├── main.tf
├── variables.tf
├── outputs.tf
├── versions.tf
├── locals.tf
├── backend.tf
└── terraform.tfvars
```
### Example: Simple VPC + EC2
```hcl
# versions.tf
terraform {
required_version = ">= 1.5.0"
required_providers {
aws = {
source = "hashicorp/aws"
version = "~> 5.0"
}
}
}
# locals.tf
locals {
name_prefix = "${var.project}-${var.environment}"
common_tags = {
Project = var.project
Environment = var.environment
ManagedBy = "terraform"
}
}
# main.tf
resource "aws_vpc" "main" {
cidr_block = var.vpc_cidr
enable_dns_hostnames = true
enable_dns_support = true
tags = merge(local.common_tags, {
Name = "${local.name_prefix}-vpc"
})
}
resource "aws_subnet" "public" {
count = length(var.public_subnet_cidrs)
vpc_id = aws_vpc.main.id
cidr_block = var.public_subnet_cidrs[count.index]
availability_zone = var.availability_zones[count.index]
tags = merge(local.common_tags, {
Name = "${local.name_prefix}-public-${count.index + 1}"
Tier = "public"
})
}
# variables.tf
variable "project" {
description = "Project name used for resource naming"
type = string
}
variable "environment" {
description = "Deployment environment"
type = string
validation {
condition = contains(["dev", "staging", "prod"], var.environment)
error_message = "Environment must be dev, staging, or prod."
}
}
variable "vpc_cidr" {
description = "CIDR block for the VPC"
type = string
default = "10.0.0.0/16"
validation {
condition = can(cidrhost(var.vpc_cidr, 0))
error_message = "Must be a valid CIDR block."
}
}
variable "public_subnet_cidrs" {
description = "CIDR blocks for public subnets"
type = list(string)
default = ["10.0.1.0/24", "10.0.2.0/24"]
}
variable "availability_zones" {
description = "AZs for subnet placement"
type = list(string)
default = ["us-east-1a", "us-east-1b"]
}
# outputs.tf
output "vpc_id" {
description = "ID of the created VPC"
value = aws_vpc.main.id
}
output "public_subnet_ids" {
description = "IDs of public subnets"
value = aws_subnet.public[*].id
}
```
---
## Pattern 2: Nested Modules (Composition)
Best for: Multiple environments, shared patterns, team collaboration.
```
infrastructure/
├── environments/
│ ├── dev/
│ │ ├── main.tf
│ │ ├── backend.tf
│ │ └── terraform.tfvars
│ ├── staging/
│ │ └── ...
│ └── prod/
│ └── ...
└── modules/
├── networking/
│ ├── main.tf
│ ├── variables.tf
│ └── outputs.tf
├── compute/
│ └── ...
└── database/
└── ...
```
### Root Module (environments/dev/main.tf)
```hcl
module "networking" {
source = "../../modules/networking"
project = var.project
environment = "dev"
vpc_cidr = "10.0.0.0/16"
public_subnet_cidrs = ["10.0.1.0/24", "10.0.2.0/24"]
private_subnet_cidrs = ["10.0.10.0/24", "10.0.11.0/24"]
}
module "compute" {
source = "../../modules/compute"
project = var.project
environment = "dev"
vpc_id = module.networking.vpc_id
subnet_ids = module.networking.private_subnet_ids
instance_type = "t3.micro"
instance_count = 1
}
module "database" {
source = "../../modules/database"
project = var.project
environment = "dev"
vpc_id = module.networking.vpc_id
subnet_ids = module.networking.private_subnet_ids
instance_class = "db.t3.micro"
allocated_storage = 20
db_password = var.db_password
}
```
### Key Rules
- Child modules never call other child modules
- Pass values explicitly — no hidden data source lookups in children
- Provider configuration only in root module
- Each module has its own variables.tf, outputs.tf, main.tf
---
## Pattern 3: Registry Module Pattern
Best for: Reusable modules shared across teams or organizations.
```
terraform-aws-vpc/
├── main.tf
├── variables.tf
├── outputs.tf
├── versions.tf
├── README.md
├── examples/
│ ├── simple/
│ │ └── main.tf
│ └── complete/
│ └── main.tf
└── modules/
├── subnet/
│ ├── main.tf
│ ├── variables.tf
│ └── outputs.tf
└── nat-gateway/
└── ...
```
### Publishing Conventions
```hcl
# Consumer usage
module "vpc" {
source = "terraform-aws-modules/vpc/aws"
version = "~> 5.0"
name = "my-vpc"
cidr = "10.0.0.0/16"
azs = ["us-east-1a", "us-east-1b"]
private_subnets = ["10.0.1.0/24", "10.0.2.0/24"]
public_subnets = ["10.0.101.0/24", "10.0.102.0/24"]
enable_nat_gateway = true
single_nat_gateway = true
}
```
### Registry Module Requirements
- Repository named `terraform-<PROVIDER>-<NAME>`
- README.md with usage examples
- Semantic versioning via git tags
- examples/ directory with working configurations
- No provider configuration in the module itself
---
## Pattern 4: Mono-Repo with Workspaces
Best for: Teams that prefer single-repo with workspace-based isolation.
```hcl
# backend.tf
terraform {
backend "s3" {
bucket = "my-terraform-state"
key = "project/terraform.tfstate"
region = "us-east-1"
dynamodb_table = "terraform-locks"
encrypt = true
}
}
# main.tf
locals {
env_config = {
dev = {
instance_type = "t3.micro"
instance_count = 1
db_class = "db.t3.micro"
}
staging = {
instance_type = "t3.small"
instance_count = 2
db_class = "db.t3.small"
}
prod = {
instance_type = "t3.large"
instance_count = 3
db_class = "db.r5.large"
}
}
# NOTE: this lookup fails with an "invalid index" error in any unlisted
# workspace — including the built-in "default" workspace — so always
# select a named workspace (dev/staging/prod) before planning
config = local.env_config[terraform.workspace]
}
```
### Usage
```bash
terraform workspace new dev
terraform workspace new staging
terraform workspace new prod
terraform workspace select dev
terraform apply
terraform workspace select prod
terraform apply
```
### Workspace Caveats
- All environments share the same backend — less isolation than separate directories
- A mistake in the code affects all environments
- Can't have different provider versions per workspace
- Recommended only for simple setups; prefer separate directories for production
---
## Pattern 5: for_each vs count
### Use `count` for identical resources
```hcl
resource "aws_subnet" "public" {
count = 3
vpc_id = aws_vpc.main.id
cidr_block = cidrsubnet(var.vpc_cidr, 8, count.index)
availability_zone = data.aws_availability_zones.available.names[count.index]
}
```
### Use `for_each` for distinct resources
```hcl
variable "buckets" {
type = map(object({
versioning = bool
lifecycle_days = number
}))
default = {
logs = { versioning = false, lifecycle_days = 30 }
backups = { versioning = true, lifecycle_days = 90 }
assets = { versioning = true, lifecycle_days = 0 }
}
}
resource "aws_s3_bucket" "this" {
for_each = var.buckets
bucket = "${var.project}-${each.key}"
}
resource "aws_s3_bucket_versioning" "this" {
for_each = { for k, v in var.buckets : k => v if v.versioning }
bucket = aws_s3_bucket.this[each.key].id
versioning_configuration {
status = "Enabled"
}
}
```
### Why `for_each` > `count`
- `count` uses index — removing item 0 shifts all others, causing destroy/recreate
- `for_each` uses keys — removing a key only affects that resource
- Use `count` only for identical resources where order doesn't matter
---
## Variable Design Patterns
### Object Variables for Related Settings
```hcl
variable "database" {
description = "Database configuration"
type = object({
engine = string
instance_class = string
storage_gb = number
multi_az = bool
backup_days = number
})
default = {
engine = "postgres"
instance_class = "db.t3.micro"
storage_gb = 20
multi_az = false
backup_days = 7
}
}
```
### Validation Blocks
```hcl
variable "instance_type" {
description = "EC2 instance type"
type = string
validation {
condition = can(regex("^t[23]\\.", var.instance_type))
error_message = "Only t2 or t3 instance types are allowed."
}
}
variable "cidr_block" {
description = "VPC CIDR block"
type = string
validation {
condition = can(cidrhost(var.cidr_block, 0))
error_message = "Must be a valid IPv4 CIDR block."
}
}
```
---
## Anti-Patterns to Avoid
| Anti-Pattern | Problem | Solution |
|-------------|---------|----------|
| God module (100+ resources) | Impossible to reason about, slow plan/apply | Split into focused child modules |
| Circular module dependencies | Terraform can't resolve dependency graph | Flatten or restructure module boundaries |
| Data sources in child modules | Hidden dependencies, hard to test | Pass values as variables from root module |
| Provider config in child modules | Can't reuse module across accounts/regions | Configure providers in root only |
| Hardcoded values | Not reusable across environments | Use variables with defaults and validation |
| No outputs | Consumer modules can't reference resources | Output IDs, ARNs, endpoints |
| No variable descriptions | Users don't know what to provide | Every variable gets a description |
| Secrets in committed `terraform.tfvars` | Secrets leak to version control | Keep secrets out of tfvars (use `TF_VAR_*` env vars or Vault); `.gitignore` any tfvars that must hold them |

View File

@@ -0,0 +1,472 @@
# Terraform State Management Reference
## Backend Configuration Patterns
### AWS: S3 + DynamoDB (Recommended)
```hcl
terraform {
backend "s3" {
bucket = "mycompany-terraform-state"
key = "project/env/terraform.tfstate"
region = "us-east-1"
encrypt = true
dynamodb_table = "terraform-locks"
# Optional: KMS key for encryption
# kms_key_id = "arn:aws:kms:us-east-1:ACCOUNT:key/KEY_ID"
}
}
```
**Prerequisites:**
```hcl
# Bootstrap these resources manually or with a separate Terraform config
resource "aws_s3_bucket" "state" {
bucket = "mycompany-terraform-state"
lifecycle {
prevent_destroy = true
}
}
resource "aws_s3_bucket_versioning" "state" {
bucket = aws_s3_bucket.state.id
versioning_configuration {
status = "Enabled"
}
}
resource "aws_s3_bucket_server_side_encryption_configuration" "state" {
bucket = aws_s3_bucket.state.id
rule {
apply_server_side_encryption_by_default {
sse_algorithm = "aws:kms"
}
}
}
resource "aws_s3_bucket_public_access_block" "state" {
bucket = aws_s3_bucket.state.id
block_public_acls = true
block_public_policy = true
ignore_public_acls = true
restrict_public_buckets = true
}
resource "aws_dynamodb_table" "locks" {
name = "terraform-locks"
billing_mode = "PAY_PER_REQUEST"
hash_key = "LockID"
attribute {
name = "LockID"
type = "S"
}
}
```
---
### GCP: Google Cloud Storage
```hcl
terraform {
backend "gcs" {
bucket = "mycompany-terraform-state"
prefix = "project/env"
}
}
```
**Key features:**
- Native locking (no separate lock table needed)
- Object versioning for state history
- IAM-based access control
- Encryption at rest by default
---
### Azure: Blob Storage
```hcl
terraform {
backend "azurerm" {
resource_group_name = "terraform-state-rg"
storage_account_name = "mycompanytfstate"
container_name = "tfstate"
key = "project/env/terraform.tfstate"
}
}
```
**Key features:**
- Native blob locking
- Encryption at rest with Microsoft-managed or customer-managed keys
- RBAC-based access control
---
### Terraform Cloud / Enterprise
```hcl
terraform {
cloud {
organization = "mycompany"
workspaces {
name = "project-dev"
}
}
}
```
**Key features:**
- Built-in state locking, encryption, and versioning
- RBAC and team-based access control
- Remote execution (plan/apply run in TF Cloud)
- Sentinel policy-as-code integration
- Cost estimation on plans
---
## Environment Isolation Strategies
### Strategy 1: Separate Directories (Recommended)
```
infrastructure/
├── environments/
│ ├── dev/
│ │ ├── main.tf
│ │ ├── backend.tf # key = "project/dev/terraform.tfstate"
│ │ └── terraform.tfvars
│ ├── staging/
│ │ ├── main.tf
│ │ ├── backend.tf # key = "project/staging/terraform.tfstate"
│ │ └── terraform.tfvars
│ └── prod/
│ ├── main.tf
│ ├── backend.tf # key = "project/prod/terraform.tfstate"
│ └── terraform.tfvars
└── modules/
└── ...
```
**Pros:**
- Complete isolation — a mistake in dev can't affect prod
- Different provider versions per environment
- Different module versions per environment (pin prod, iterate in dev)
- Clear audit trail — who changed what, where
**Cons:**
- Some duplication across environment directories
- Must update modules in each environment separately
### Strategy 2: Terraform Workspaces
```hcl
# Single directory, multiple workspaces
terraform {
backend "s3" {
bucket = "mycompany-terraform-state"
key = "project/terraform.tfstate"
region = "us-east-1"
dynamodb_table = "terraform-locks"
encrypt = true
}
}
# State files stored at:
# env:/dev/project/terraform.tfstate
# env:/staging/project/terraform.tfstate
# env:/prod/project/terraform.tfstate
```
```bash
terraform workspace new dev
terraform workspace select dev
terraform plan -var-file="env/dev.tfvars"
```
**Pros:**
- Less duplication — single set of .tf files
- Quick to switch between environments
- Built-in workspace support in backends
**Cons:**
- Shared code means a bug affects all environments simultaneously
- Can't have different provider versions per workspace
- Easy to accidentally apply to wrong workspace
- Less isolation than separate directories
### Strategy 3: Terragrunt (DRY Configuration)
```
infrastructure/
├── terragrunt.hcl # Root — defines remote state pattern
├── modules/
│ └── vpc/
│ ├── main.tf
│ ├── variables.tf
│ └── outputs.tf
├── dev/
│ ├── terragrunt.hcl # env = "dev"
│ └── vpc/
│ └── terragrunt.hcl # inputs for dev VPC
├── staging/
│ └── ...
└── prod/
└── ...
```
```hcl
# Root terragrunt.hcl
remote_state {
backend = "s3"
generate = {
path = "backend.tf"
if_exists = "overwrite_terragrunt"
}
config = {
bucket = "mycompany-terraform-state"
key = "${path_relative_to_include()}/terraform.tfstate"
region = "us-east-1"
encrypt = true
dynamodb_table = "terraform-locks"
}
}
# dev/vpc/terragrunt.hcl
terraform {
source = "../../modules/vpc"
}
inputs = {
environment = "dev"
vpc_cidr = "10.0.0.0/16"
}
```
**Pros:**
- Maximum DRY — define module once, parameterize per environment
- Automatic state key generation from directory structure
- Dependency management between modules (`dependency` blocks)
- `run-all` for applying multiple modules at once
**Cons:**
- Additional tool dependency (Terragrunt)
- Learning curve
- Debugging can be harder (generated files)
---
## State Migration Patterns
### Local to Remote (S3)
```bash
# 1. Add backend configuration to backend.tf
# 2. Run init with migration flag
terraform init -migrate-state
# Terraform will prompt:
# "Do you want to copy existing state to the new backend?"
# Answer: yes
```
### Between Remote Backends
```bash
# 1. Pull current state
terraform state pull > terraform.tfstate.backup
# 2. Update backend configuration in backend.tf
# 3. Reinitialize with migration
terraform init -migrate-state
# 4. Verify
terraform plan # Should show no changes
```
### State Import (Existing Resources)
```bash
# Import a single resource
terraform import aws_instance.web i-1234567890abcdef0
# Import with for_each key
terraform import 'aws_subnet.public["us-east-1a"]' subnet-0123456789abcdef0
# Bulk import (Terraform 1.5+): declare import blocks in a .tf file —
# this is HCL, not a shell command — then run terraform plan/apply:
#
#   import {
#     to = aws_instance.web
#     id = "i-1234567890abcdef0"
#   }
```
### State Move (Refactoring)
```bash
# Rename a resource (avoids destroy/recreate)
terraform state mv aws_instance.old_name aws_instance.new_name
# Move into a module
terraform state mv aws_instance.web module.compute.aws_instance.web
# Move between state files
terraform state mv -state-out=other.tfstate aws_instance.web aws_instance.web
```
---
## State Locking
### Why Locking Matters
Without locking, two concurrent `terraform apply` runs can corrupt state. The second apply reads stale state and may create duplicate resources or lose track of existing ones.
### Lock Behavior by Backend
| Backend | Lock Mechanism | Auto-Lock | Force Unlock |
|---------|---------------|-----------|--------------|
| S3 | DynamoDB table | Yes (if table configured) | `terraform force-unlock LOCK_ID` |
| GCS | Native blob locking | Yes | `terraform force-unlock LOCK_ID` |
| Azure Blob | Native blob lease | Yes | `terraform force-unlock LOCK_ID` |
| TF Cloud | Built-in | Always | Via UI or API |
| Consul | Key-value lock | Yes | `terraform force-unlock LOCK_ID` |
| Local | OS-level lock on the state file (`.terraform.tfstate.lock.info`) | Yes (single user) | Remove the stale lock-info file |

Note: `.terraform.lock.hcl` is the provider *dependency* lock file and has nothing to do with state locking — it should be committed to version control.
### Force Unlock (Emergency Only)
```bash
# Only use when you're certain no other process is running
terraform force-unlock LOCK_ID
# The LOCK_ID is shown in the error message when lock fails:
# Error: Error locking state: Error acquiring the state lock
# Lock Info:
# ID: 12345678-abcd-1234-abcd-1234567890ab
```
---
## State Security Best Practices
### 1. Encrypt at Rest
```hcl
# S3 — server-side encryption
backend "s3" {
encrypt = true
kms_key_id = "arn:aws:kms:us-east-1:ACCOUNT:key/KEY_ID"
}
```
### 2. Restrict Access
```json
{
"Version": "2012-10-17",
"Statement": [
{
"Effect": "Allow",
"Action": [
"s3:GetObject",
"s3:PutObject",
"s3:DeleteObject"
],
"Resource": "arn:aws:s3:::mycompany-terraform-state/project/*",
"Condition": {
"StringEquals": {
"aws:PrincipalTag/Team": "platform"
}
}
},
{
"Effect": "Allow",
"Action": [
"dynamodb:GetItem",
"dynamodb:PutItem",
"dynamodb:DeleteItem"
],
"Resource": "arn:aws:dynamodb:us-east-1:ACCOUNT:table/terraform-locks"
}
]
}
```
### 3. Enable Versioning (State History)
```hcl
resource "aws_s3_bucket_versioning" "state" {
bucket = aws_s3_bucket.state.id
versioning_configuration {
status = "Enabled"
}
}
```
Versioning lets you recover from state corruption by restoring a previous version.
### 4. Audit Access
- Enable S3 access logging or CloudTrail data events
- Monitor for unexpected state reads (potential secret extraction)
- State files contain sensitive values — treat them like credentials
### 5. Sensitive Values in State
Terraform stores all resource attributes in state, including passwords, private keys, and tokens. This is unavoidable. Mitigate by:
- Encrypting state at rest (KMS)
- Restricting state file access (IAM)
- Using `sensitive = true` on variables and outputs (prevents display, not storage)
- Rotating secrets regularly (state contains the value at apply time)
---
## Drift Detection and Reconciliation
### Detect Drift
```bash
# Plan with detailed exit code
terraform plan -detailed-exitcode
# Exit 0 = no changes
# Exit 1 = error
# Exit 2 = changes detected (drift)
```
### Common Drift Sources
| Source | Example | Prevention |
|--------|---------|------------|
| Console changes | Someone edits SG rules in AWS Console | SCPs to restrict console access, or accept and reconcile |
| Auto-scaling | ASG launches instances not in state | Don't manage individual instances; manage ASG |
| External tools | Ansible modifies EC2 tags | Agree on ownership boundaries |
| Dependent resource changes | AMI deregistered | Use data sources to detect, lifecycle ignore_changes |
### Reconciliation Options
```hcl
# Option 1: Apply to restore desired state
terraform apply
# Option 2: Refresh state to match reality
terraform apply -refresh-only
# Option 3: Ignore specific attribute drift
resource "aws_instance" "web" {
lifecycle {
ignore_changes = [tags["LastModifiedBy"], ami]
}
}
# Option 4: Import the manually-created resource
terraform import aws_security_group_rule.new sg-12345_ingress_tcp_443_443_0.0.0.0/0
```
---
## Troubleshooting Checklist
| Symptom | Likely Cause | Fix |
|---------|-------------|-----|
| "Error acquiring state lock" | Concurrent run or crashed process | Wait for other run to finish, or `force-unlock` |
| "Backend configuration changed" | Backend config modified | Run `terraform init -reconfigure` or `-migrate-state` |
| "Resource already exists" | Resource created outside Terraform | `terraform import` the resource |
| "No matching resource found" | Resource deleted outside Terraform | `terraform state rm` the resource |
| State file growing very large | Too many resources in one state | Split into smaller state files using modules |
| Slow plan/apply | Large state file, many resources | Split state, use `-target` for urgent changes |
| "Provider produced inconsistent result" | Provider bug or API race condition | Retry, or pin provider version |
| Workspace confusion | Applied to wrong workspace | Always check `terraform workspace show` before apply |