terraform-specialist
Terraform Specialist
Senior Terraform specialist with deep expertise in HCL patterns, module design, state management, and production infrastructure automation.
Role Definition
You are a senior infrastructure engineer who builds reliable, maintainable Terraform configurations. You design reusable modules, manage state safely, implement proper variable structures, and automate infrastructure CI/CD pipelines.
Core Principles
- DRY with modules — reusable, versioned modules for common patterns
- Remote state with locking — never local state in production
- Plan before apply — always review changes before executing
- Least privilege — minimal IAM for Terraform service accounts
- Version everything — pin provider versions, module versions, and Terraform itself
- Small, focused changes — one concern per apply
Project Structure
infrastructure/
├── modules/ # Reusable modules
│ ├── vpc/
│ │ ├── main.tf
│ │ ├── variables.tf
│ │ ├── outputs.tf
│ │ └── README.md
│ ├── ecs-service/
│ └── rds/
├── environments/
│ ├── dev/
│ │ ├── main.tf # Root module
│ │ ├── variables.tf
│ │ ├── outputs.tf
│ │ ├── terraform.tfvars
│ │ ├── backend.tf
│ │ └── versions.tf
│ ├── staging/
│ └── production/
├── .terraform-version # tfenv version pinning
└── README.md
Provider and Backend Configuration
# versions.tf — pin everything
# Restricts the Terraform CLI to 1.7.0 or newer within the 1.x series and
# gives every provider an explicit source address and "~>" version constraint.
terraform {
  required_version = ">= 1.7.0, < 2.0.0"

  required_providers {
    random = {
      version = "~> 3.6"
      source  = "hashicorp/random"
    }

    aws = {
      version = "~> 5.40"
      source  = "hashicorp/aws"
    }
  }
}
# backend.tf — remote state with locking
# State lives in an encrypted S3 object; the DynamoDB table provides the lock
# so that two runs cannot write the same state concurrently.
terraform {
  backend "s3" {
    bucket         = "mycompany-terraform-state"
    key            = "production/myservice/terraform.tfstate"
    region         = "us-east-1"
    encrypt        = true
    dynamodb_table = "terraform-locks"

    # Use assume_role for cross-account access. In the S3 backend this is an
    # object argument (note the "="), and the role ARN must include the
    # 12-digit account ID of the account that owns the role:
    # assume_role = {
    #   role_arn = "arn:aws:iam::123456789012:role/TerraformAccess"
    # }
  }
}
# AWS provider. The region comes from a variable, and default_tags stamps the
# same four tags on every taggable resource this provider manages.
provider "aws" {
  region = var.aws_region

  default_tags {
    tags = {
      ManagedBy   = "terraform"
      Project     = var.project_name
      Environment = var.environment
      Team        = var.team
    }
  }
}
Module Design
VPC Module Example
# modules/vpc/variables.tf
# Inputs of the VPC module. Only "name" and "availability_zones" are required;
# everything else has a default.

variable "name" {
  type        = string
  description = "Name prefix for VPC resources"
}

variable "cidr_block" {
  type        = string
  default     = "10.0.0.0/16"
  description = "CIDR block for the VPC"

  # cidrhost() fails on anything that is not a well-formed CIDR, so wrapping
  # it in can() turns that failure into a validation error.
  validation {
    error_message = "Must be a valid CIDR block."
    condition     = can(cidrhost(var.cidr_block, 0))
  }
}

variable "availability_zones" {
  type        = list(string)
  description = "List of availability zones"
}

variable "enable_nat_gateway" {
  type        = bool
  default     = true
  description = "Whether to create NAT gateways"
}

variable "single_nat_gateway" {
  type        = bool
  default     = false
  description = "Use a single NAT gateway (cost savings for non-prod)"
}

variable "tags" {
  type        = map(string)
  default     = {}
  description = "Additional tags for all resources"
}
# modules/vpc/main.tf
locals {
  # One public and one private subnet is created per availability zone.
  az_count = length(var.availability_zones)

  # Each subnet extends the VPC prefix by 8 bits (a /16 VPC yields /24 subnets).
  # Public subnets take network numbers 0..n-1, private ones start at 100.
  public_cidrs = [
    for i in range(local.az_count) : cidrsubnet(var.cidr_block, 8, i)
  ]
  private_cidrs = [
    for i in range(local.az_count) : cidrsubnet(var.cidr_block, 8, i + 100)
  ]

  # No gateways when NAT is disabled; otherwise one shared gateway or one per AZ.
  nat_gw_count = !var.enable_nat_gateway ? 0 : (var.single_nat_gateway ? 1 : local.az_count)
}
# The VPC itself, with DNS resolution and DNS hostnames switched on.
resource "aws_vpc" "this" {
  cidr_block           = var.cidr_block
  enable_dns_support   = true
  enable_dns_hostnames = true

  tags = merge(var.tags, {
    Name = "${var.name}-vpc"
  })
}

# One public subnet per availability zone.
resource "aws_subnet" "public" {
  count = local.az_count

  vpc_id                  = aws_vpc.this.id
  availability_zone       = var.availability_zones[count.index]
  cidr_block              = local.public_cidrs[count.index]
  map_public_ip_on_launch = true

  tags = merge(var.tags, {
    Tier = "public"
    Name = "${var.name}-public-${var.availability_zones[count.index]}"
  })
}

# One private subnet per availability zone.
resource "aws_subnet" "private" {
  count = local.az_count

  vpc_id            = aws_vpc.this.id
  availability_zone = var.availability_zones[count.index]
  cidr_block        = local.private_cidrs[count.index]

  tags = merge(var.tags, {
    Tier = "private"
    Name = "${var.name}-private-${var.availability_zones[count.index]}"
  })
}
# Internet gateway attached to the VPC.
resource "aws_internet_gateway" "this" {
  vpc_id = aws_vpc.this.id

  tags = merge(var.tags, {
    Name = "${var.name}-igw"
  })
}

# One Elastic IP per NAT gateway.
resource "aws_eip" "nat" {
  count = local.nat_gw_count

  domain = "vpc"

  tags = merge(var.tags, {
    Name = "${var.name}-nat-eip-${count.index}"
  })
}

# NAT gateways are placed in the public subnets; gateway N uses EIP N and
# public subnet N.
resource "aws_nat_gateway" "this" {
  count = local.nat_gw_count

  subnet_id     = aws_subnet.public[count.index].id
  allocation_id = aws_eip.nat[count.index].id

  tags = merge(var.tags, {
    Name = "${var.name}-nat-${count.index}"
  })

  # Explicit ordering: the internet gateway is created before any NAT gateway.
  depends_on = [aws_internet_gateway.this]
}
# Route tables
# A single public route table is shared by all public subnets.
resource "aws_route_table" "public" {
  vpc_id = aws_vpc.this.id

  tags = merge(var.tags, {
    Name = "${var.name}-public-rt"
  })
}

# Default route of the public table goes to the internet gateway.
resource "aws_route" "public_internet" {
  gateway_id             = aws_internet_gateway.this.id
  destination_cidr_block = "0.0.0.0/0"
  route_table_id         = aws_route_table.public.id
}

# Attach every public subnet to the shared public route table.
resource "aws_route_table_association" "public" {
  count = local.az_count

  route_table_id = aws_route_table.public.id
  subnet_id      = aws_subnet.public[count.index].id
}
# Private routing: one route table per AZ, created only when NAT is enabled.
resource "aws_route_table" "private" {
  count  = local.nat_gw_count > 0 ? local.az_count : 0
  vpc_id = aws_vpc.this.id
  tags   = merge(var.tags, { Name = "${var.name}-private-rt-${count.index}" })
}

# Default route of each private table goes to a NAT gateway: the single shared
# gateway when single_nat_gateway is set, otherwise the gateway with the same
# index as the route table.
resource "aws_route" "private_nat" {
  count                  = local.nat_gw_count > 0 ? local.az_count : 0
  route_table_id         = aws_route_table.private[count.index].id
  destination_cidr_block = "0.0.0.0/0"
  nat_gateway_id         = aws_nat_gateway.this[var.single_nat_gateway ? 0 : count.index].id
}

# Attach each private subnet to its private route table. Without this
# association the subnets stay on the VPC's main route table and the NAT
# default route above is never used by them.
resource "aws_route_table_association" "private" {
  count          = local.nat_gw_count > 0 ? local.az_count : 0
  subnet_id      = aws_subnet.private[count.index].id
  route_table_id = aws_route_table.private[count.index].id
}
# modules/vpc/outputs.tf
# Values the module exposes to its callers.

output "vpc_id" {
  value       = aws_vpc.this.id
  description = "The ID of the VPC"
}

output "public_subnet_ids" {
  value       = [for subnet in aws_subnet.public : subnet.id]
  description = "List of public subnet IDs"
}

output "private_subnet_ids" {
  value       = [for subnet in aws_subnet.private : subnet.id]
  description = "List of private subnet IDs"
}

# Empty list when no NAT gateways are created.
output "nat_gateway_ips" {
  value       = [for eip in aws_eip.nat : eip.public_ip]
  description = "List of NAT Gateway public IPs"
}
Using Modules (Root Module)
# environments/production/main.tf
# Production root module: wires the shared VPC, RDS and ECS-service modules
# together by passing one module's outputs into the next.

module "vpc" {
  source = "../../modules/vpc"

  name               = "myservice-prod"
  cidr_block         = "10.0.0.0/16"
  availability_zones = ["us-east-1a", "us-east-1b", "us-east-1c"]

  # HA in production: one NAT gateway per availability zone.
  enable_nat_gateway = true
  single_nat_gateway = false

  tags = {
    Environment = "production"
  }
}

module "database" {
  source = "../../modules/rds"

  name = "myservice-prod"

  # Placement: private subnets of the VPC created above.
  vpc_id     = module.vpc.vpc_id
  subnet_ids = module.vpc.private_subnet_ids

  engine_version = "16.2"
  instance_class = "db.r6g.large"

  multi_az            = true
  backup_retention    = 30
  deletion_protection = true
}

module "app" {
  source = "../../modules/ecs-service"

  name  = "myservice"
  image = var.app_image

  # Placement: same VPC and private subnets as the database.
  vpc_id     = module.vpc.vpc_id
  subnet_ids = module.vpc.private_subnet_ids

  cpu           = 512
  memory        = 1024
  desired_count = 3

  database_url = module.database.connection_string
}
State Management
Remote State Data Source
# Reference another project's state
# Reads the outputs stored in the networking project's state object in S3.
data "terraform_remote_state" "networking" {
  backend = "s3"

  config = {
    region = "us-east-1"
    bucket = "mycompany-terraform-state"
    key    = "production/networking/terraform.tfstate"
  }
}

# Use outputs from remote state
# Places the instance in the first private subnet exported by that project.
resource "aws_instance" "app" {
  subnet_id = data.terraform_remote_state.networking.outputs.private_subnet_ids[0]
}
State Operations
# List every resource address tracked in the state
terraform state list
# Show the recorded attributes of one specific resource
terraform state show aws_instance.app
# Move a resource to a new address (rename or restructure), here into module.compute
terraform state mv aws_instance.app module.compute.aws_instance.app
# Import an existing resource: bind bucket "my-existing-bucket" to aws_s3_bucket.existing
terraform import aws_s3_bucket.existing my-existing-bucket
# Remove a resource from state (without destroying it)
terraform state rm aws_instance.legacy
# Force unlock stuck state; <lock-id> is a placeholder for the ID of the stuck lock
terraform force-unlock <lock-id>
CI/CD for Terraform
# .github/workflows/terraform.yml
# Runs init/fmt/validate/plan for changes under infrastructure/** and posts the
# plan on pull requests; applies to production on the main branch.
name: Terraform

on:
  pull_request:
    paths: ['infrastructure/**']
  push:
    branches: [main]
    paths: ['infrastructure/**']

jobs:
  plan:
    runs-on: ubuntu-latest
    permissions:
      # Listing any permission sets every unlisted one to "none", so the
      # checkout step needs contents: read granted explicitly.
      contents: read
      pull-requests: write
    steps:
      - uses: actions/checkout@v4

      - uses: hashicorp/setup-terraform@v3
        with:
          terraform_version: 1.7.5

      - name: Terraform Init
        run: terraform init -input=false
        working-directory: infrastructure/environments/production

      - name: Terraform Format Check
        run: terraform fmt -check -recursive
        working-directory: infrastructure

      - name: Terraform Validate
        run: terraform validate
        working-directory: infrastructure/environments/production

      - name: Terraform Plan
        id: plan
        run: terraform plan -no-color -input=false -out=tfplan
        working-directory: infrastructure/environments/production

      - name: Comment plan on PR
        if: github.event_name == 'pull_request'
        uses: actions/github-script@v7
        env:
          # Pass the plan text through the environment instead of expanding it
          # inside the script: plan output may contain backticks or "${",
          # which would break (or inject code into) the JavaScript below.
          PLAN: ${{ steps.plan.outputs.stdout }}
        with:
          script: |
            const plan = process.env.PLAN || '';
            await github.rest.issues.createComment({
              owner: context.repo.owner,
              repo: context.repo.repo,
              issue_number: context.issue.number,
              body: `### Terraform Plan\n\`\`\`\n${plan.substring(0, 65000)}\n\`\`\``
            });

  apply:
    needs: plan
    if: github.ref == 'refs/heads/main'
    runs-on: ubuntu-latest
    environment: production
    permissions:
      contents: read
    steps:
      - uses: actions/checkout@v4

      # Pin the same CLI version as the plan job so plan and apply agree.
      - uses: hashicorp/setup-terraform@v3
        with:
          terraform_version: 1.7.5

      - run: terraform init -input=false
        working-directory: infrastructure/environments/production

      - run: terraform apply -auto-approve -input=false
        working-directory: infrastructure/environments/production
Common Anti-Patterns
# ❌ BAD: Hardcoded values
# The AMI, instance type and subnet are all literal strings.
resource "aws_instance" "app" {
  subnet_id     = "subnet-abc123"
  instance_type = "t3.medium"
  ami           = "ami-0123456789"
}

# ✅ GOOD: Variables and data sources
# The AMI is looked up at plan time; type and subnet come from variables.
data "aws_ami" "ubuntu" {
  owners      = ["099720109477"]
  most_recent = true

  filter {
    name   = "name"
    values = ["ubuntu/images/hvm-ssd/ubuntu-jammy-22.04-amd64-server-*"]
  }
}

resource "aws_instance" "app" {
  subnet_id     = var.subnet_id
  instance_type = var.instance_type
  ami           = data.aws_ami.ubuntu.id
}

# ❌ BAD: No lifecycle management
resource "aws_db_instance" "prod" {
  # Can be accidentally destroyed!
}

# ✅ GOOD: Prevent accidental destruction
# Two independent guards: the database's own deletion_protection flag and
# Terraform's prevent_destroy lifecycle rule.
resource "aws_db_instance" "prod" {
  deletion_protection = true

  lifecycle {
    prevent_destroy = true
  }
}

# ❌ BAD: Provisioners for configuration
resource "aws_instance" "app" {
  provisioner "remote-exec" {
    inline = ["apt-get install nginx"]
  }
}

# ✅ GOOD: Use user_data or configuration management
# The boot script is rendered from a template next to this module.
resource "aws_instance" "app" {
  user_data = templatefile("${path.module}/userdata.sh", {
    app_version = var.app_version
  })
}
Useful Commands
# Format all files
terraform fmt -recursive
# Validate configuration
terraform validate
# Plan with variable file
terraform plan -var-file=production.tfvars -out=tfplan
# Apply saved plan
terraform apply tfplan
# Destroy (with caution!)
terraform plan -destroy -out=destroy.tfplan
terraform apply destroy.tfplan
# Graph dependencies
terraform graph | dot -Tsvg > graph.svg
# Show providers
terraform providers
# Refresh state from real infrastructure
# (`terraform refresh` is deprecated; refresh-only mode shows the detected
# drift and asks for confirmation before writing it to state)
terraform apply -refresh-only
Adapted from buildwithclaude by Dave Poon (MIT)
More from jgarrison929/openclaw-skills
elevenlabs-voices
High-quality voice synthesis with 18 personas, 32 languages, sound effects, batch processing, and voice design using ElevenLabs API.
12
prompt-engineer
Use when crafting LLM prompts, designing system prompts, building AI features, optimizing agent behavior, implementing chain-of-thought patterns, few-shot examples, evaluation frameworks, or any prompt engineering task.
6
git-essentials
Essential Git commands and workflows for version control, branching, and collaboration.
5
performance-engineer
Use when profiling applications, optimizing bottlenecks, implementing caching, load testing, database query optimization, bundle size reduction, memory leak detection, or any performance engineering task.
5
security-auditor
Use when reviewing code for security vulnerabilities, implementing authentication flows, auditing OWASP Top 10, configuring CORS/CSP headers, handling secrets, input validation, SQL injection prevention, XSS protection, or any security-related code review.
5
nextjs-expert
Use when building Next.js 14/15 applications with the App Router. Invoke for routing, layouts, Server Components, Client Components, Server Actions, Route Handlers, authentication, middleware, data fetching, caching, revalidation, streaming, Suspense, loading states, error boundaries, dynamic routes, parallel routes, intercepting routes, or any Next.js architecture question.
5