基础设施即代码(IaC)实践:Terraform + Ansible 企业级自动化
基础设施即代码(Infrastructure as Code, IaC)是现代云原生架构的核心实践之一。通过代码化管理基础设施,我们可以实现环境的一致性、可重复性和可审计性。本文将深入探讨Terraform和Ansible的企业级实践。
IaC架构设计
整体架构图
graph TB
A[开发者] --> B[Git Repository]
B --> C[CI/CD Pipeline]
C --> D[Terraform Plan]
D --> E[Review & Approval]
E --> F[Terraform Apply]
F --> G[AWS/Azure/GCP]
G --> H[Infrastructure]
H --> I[Ansible Playbooks]
I --> J[Configuration Management]
J --> K[Application Deployment]
L[State Backend] --> F
M[Terraform Modules] --> D
N[Ansible Roles] --> I
技术栈组合
| 层级 | 工具 | 职责 |
|---|---|---|
| 基础设施供应 | Terraform | 云资源创建和管理 |
| 配置管理 | Ansible | 系统配置和应用部署 |
| 状态管理 | Terraform Backend | 状态文件存储和锁定 |
| 密钥管理 | HashiCorp Vault | 敏感信息管理 |
| 版本控制 | Git | 代码版本管理 |
| CI/CD | GitLab CI/Jenkins | 自动化执行 |
Terraform企业级实践
项目结构设计
terraform-infrastructure/
├── environments/
│ ├── dev/
│ │ ├── main.tf
│ │ ├── variables.tf
│ │ ├── outputs.tf
│ │ └── terraform.tfvars
│ ├── staging/
│ └── production/
├── modules/
│ ├── vpc/
│ │ ├── main.tf
│ │ ├── variables.tf
│ │ ├── outputs.tf
│ │ └── README.md
│ ├── eks/
│ ├── rds/
│ └── security-groups/
├── shared/
│ ├── backend.tf
│ ├── providers.tf
│ └── versions.tf
└── scripts/
├── plan.sh
├── apply.sh
└── destroy.sh
核心配置文件
1. Provider配置
# shared/providers.tf
# Pins the Terraform core version and all provider versions so every
# environment runs against a compatible, reproducible toolchain.
terraform {
required_version = ">= 1.5.0"
required_providers {
aws = {
source = "hashicorp/aws"
version = "~> 5.0"
}
kubernetes = {
source = "hashicorp/kubernetes"
version = "~> 2.20"
}
helm = {
source = "hashicorp/helm"
version = "~> 2.10"
}
}
}
# AWS provider: default_tags are merged onto every taggable resource,
# giving consistent cost-allocation and ownership metadata for free.
provider "aws" {
region = var.aws_region
default_tags {
tags = {
Environment = var.environment
Project = var.project_name
ManagedBy = "Terraform"
Owner = var.team_name
CostCenter = var.cost_center
}
}
}
# Kubernetes provider authenticated against the EKS cluster created by
# the eks module. The exec block fetches a short-lived token via the
# AWS CLI instead of storing static credentials.
# NOTE(review): this file references module.eks, so it only works when
# included in a root module that actually declares that module — confirm
# how "shared" files are wired into each environment.
provider "kubernetes" {
host = module.eks.cluster_endpoint
cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data)
exec {
api_version = "client.authentication.k8s.io/v1beta1"
command = "aws"
args = ["eks", "get-token", "--cluster-name", module.eks.cluster_name]
}
}
2. 后端配置
# shared/backend.tf
#
# Remote state storage in S3 with DynamoDB-based state locking.
#
# NOTE: Terraform parses the backend block before evaluating variables,
# so it CANNOT reference var.* — the original `key =
# "environments/${var.environment}/terraform.tfstate"` is invalid HCL for
# a backend. Use partial configuration instead and supply the key at
# init time:
#   terraform init -backend-config="key=environments/production/terraform.tfstate"
# Also, `versioning` is not a valid s3 backend argument; enable
# versioning on the bucket itself (aws_s3_bucket_versioning resource).
terraform {
  backend "s3" {
    bucket = "company-terraform-state"
    # Overridden per environment via -backend-config at init time.
    key            = "environments/production/terraform.tfstate"
    region         = "us-west-2"
    encrypt        = true
    dynamodb_table = "terraform-state-lock"
  }
}

# DynamoDB table used for state locking. LockID is the key Terraform's
# S3 backend writes to acquire/release the lock.
resource "aws_dynamodb_table" "terraform_state_lock" {
  name         = "terraform-state-lock"
  billing_mode = "PAY_PER_REQUEST"
  hash_key     = "LockID"

  attribute {
    name = "LockID"
    type = "S"
  }

  tags = {
    Name        = "Terraform State Lock Table"
    Environment = "shared"
  }
}
3. VPC模块
# modules/vpc/main.tf
#
# Three-tier VPC: one public, one private and one database subnet per
# availability zone, with optional per-AZ NAT gateways for private
# egress.
locals {
availability_zones = data.aws_availability_zones.available.names
# cidrsubnet(vpc_cidr, 8, n) carves /24s out of a /16. Offsets keep the
# tiers apart: public at 1+, private at 10+, database at 20+.
# NOTE(review): with 10 or more AZs the public range (1..) would collide
# with the private range (10..) — fine for typical regions, but confirm.
public_subnet_cidrs = [
for i, az in local.availability_zones :
cidrsubnet(var.vpc_cidr, 8, i + 1)
]
private_subnet_cidrs = [
for i, az in local.availability_zones :
cidrsubnet(var.vpc_cidr, 8, i + 10)
]
database_subnet_cidrs = [
for i, az in local.availability_zones :
cidrsubnet(var.vpc_cidr, 8, i + 20)
]
}
# All AZs currently accepting new resources in the provider's region.
data "aws_availability_zones" "available" {
state = "available"
}
# VPC
resource "aws_vpc" "main" {
cidr_block = var.vpc_cidr
enable_dns_hostnames = true
enable_dns_support = true
tags = {
Name = "${var.project_name}-${var.environment}-vpc"
}
}
# Internet Gateway
resource "aws_internet_gateway" "main" {
vpc_id = aws_vpc.main.id
tags = {
Name = "${var.project_name}-${var.environment}-igw"
}
}
# Public Subnets
# map_public_ip_on_launch plus the kubernetes.io/role/elb tag marks
# these for internet-facing load balancers.
resource "aws_subnet" "public" {
count = length(local.availability_zones)
vpc_id = aws_vpc.main.id
cidr_block = local.public_subnet_cidrs[count.index]
availability_zone = local.availability_zones[count.index]
map_public_ip_on_launch = true
tags = {
Name = "${var.project_name}-${var.environment}-public-${count.index + 1}"
Type = "Public"
"kubernetes.io/role/elb" = "1"
}
}
# Private Subnets
# Tagged for internal load balancers; intended for workloads (e.g. EKS
# nodes) that must not be directly reachable from the internet.
resource "aws_subnet" "private" {
count = length(local.availability_zones)
vpc_id = aws_vpc.main.id
cidr_block = local.private_subnet_cidrs[count.index]
availability_zone = local.availability_zones[count.index]
tags = {
Name = "${var.project_name}-${var.environment}-private-${count.index + 1}"
Type = "Private"
"kubernetes.io/role/internal-elb" = "1"
}
}
# Database Subnets
resource "aws_subnet" "database" {
count = length(local.availability_zones)
vpc_id = aws_vpc.main.id
cidr_block = local.database_subnet_cidrs[count.index]
availability_zone = local.availability_zones[count.index]
tags = {
Name = "${var.project_name}-${var.environment}-database-${count.index + 1}"
Type = "Database"
}
}
# NAT Gateways
# One EIP + NAT gateway per AZ when enable_nat_gateway is set; the IGW
# must exist first, hence the explicit depends_on.
resource "aws_eip" "nat" {
count = var.enable_nat_gateway ? length(local.availability_zones) : 0
domain = "vpc"
depends_on = [aws_internet_gateway.main]
tags = {
Name = "${var.project_name}-${var.environment}-nat-eip-${count.index + 1}"
}
}
resource "aws_nat_gateway" "main" {
count = var.enable_nat_gateway ? length(local.availability_zones) : 0
allocation_id = aws_eip.nat[count.index].id
subnet_id = aws_subnet.public[count.index].id
tags = {
Name = "${var.project_name}-${var.environment}-nat-${count.index + 1}"
}
depends_on = [aws_internet_gateway.main]
}
# Route Tables
# Public: single table, default route to the internet gateway.
resource "aws_route_table" "public" {
vpc_id = aws_vpc.main.id
route {
cidr_block = "0.0.0.0/0"
gateway_id = aws_internet_gateway.main.id
}
tags = {
Name = "${var.project_name}-${var.environment}-public-rt"
}
}
# Private: one table per AZ when NAT is enabled (each routing to its
# AZ-local NAT gateway); otherwise a single table with NO default route,
# i.e. private subnets have no internet egress at all.
resource "aws_route_table" "private" {
count = var.enable_nat_gateway ? length(local.availability_zones) : 1
vpc_id = aws_vpc.main.id
dynamic "route" {
for_each = var.enable_nat_gateway ? [1] : []
content {
cidr_block = "0.0.0.0/0"
nat_gateway_id = aws_nat_gateway.main[count.index].id
}
}
tags = {
Name = "${var.project_name}-${var.environment}-private-rt-${count.index + 1}"
}
}
# Route Table Associations
resource "aws_route_table_association" "public" {
count = length(aws_subnet.public)
subnet_id = aws_subnet.public[count.index].id
route_table_id = aws_route_table.public.id
}
# With NAT disabled only private[0] exists, so every private subnet
# falls back to that single table.
resource "aws_route_table_association" "private" {
count = length(aws_subnet.private)
subnet_id = aws_subnet.private[count.index].id
route_table_id = var.enable_nat_gateway ? aws_route_table.private[count.index].id : aws_route_table.private[0].id
}
4. EKS模块
# modules/eks/main.tf
#
# EKS control plane with KMS-encrypted secrets, CloudWatch control-plane
# logging, a dedicated cluster security group, and managed node groups
# driven by the var.node_groups map.
#
# NOTE(review): aws_iam_role.eks_node_group and the
# eks_worker_node_policy / eks_cni_policy / eks_container_registry_policy
# attachments referenced below are not in this excerpt — presumably
# defined elsewhere in the module; confirm before applying.
# Trust policy allowing the EKS service to assume the cluster role.
data "aws_iam_policy_document" "eks_cluster_assume_role" {
statement {
effect = "Allow"
principals {
type = "Service"
identifiers = ["eks.amazonaws.com"]
}
actions = ["sts:AssumeRole"]
}
}
resource "aws_iam_role" "eks_cluster" {
name = "${var.cluster_name}-cluster-role"
assume_role_policy = data.aws_iam_policy_document.eks_cluster_assume_role.json
}
resource "aws_iam_role_policy_attachment" "eks_cluster_policy" {
policy_arn = "arn:aws:iam::aws:policy/AmazonEKSClusterPolicy"
role = aws_iam_role.eks_cluster.name
}
resource "aws_iam_role_policy_attachment" "eks_vpc_resource_controller" {
policy_arn = "arn:aws:iam::aws:policy/AmazonEKSVPCResourceController"
role = aws_iam_role.eks_cluster.name
}
# EKS Cluster
resource "aws_eks_cluster" "main" {
name = var.cluster_name
role_arn = aws_iam_role.eks_cluster.arn
version = var.kubernetes_version
vpc_config {
subnet_ids = concat(var.private_subnet_ids, var.public_subnet_ids)
endpoint_private_access = true
endpoint_public_access = var.endpoint_public_access
public_access_cidrs = var.endpoint_public_access_cidrs
security_group_ids = [aws_security_group.eks_cluster.id]
}
# Envelope-encrypt Kubernetes secrets with a customer-managed KMS key.
encryption_config {
provider {
key_arn = aws_kms_key.eks.arn
}
resources = ["secrets"]
}
enabled_cluster_log_types = var.cluster_log_types
# The log group must exist before the cluster so EKS writes into the
# Terraform-managed group (with our retention/KMS settings) rather than
# auto-creating its own.
depends_on = [
aws_iam_role_policy_attachment.eks_cluster_policy,
aws_iam_role_policy_attachment.eks_vpc_resource_controller,
aws_cloudwatch_log_group.eks_cluster,
]
tags = var.tags
}
# CloudWatch Log Group
# EKS writes control-plane logs to this fixed path by convention.
resource "aws_cloudwatch_log_group" "eks_cluster" {
name = "/aws/eks/${var.cluster_name}/cluster"
retention_in_days = var.cluster_log_retention_days
kms_key_id = aws_kms_key.eks.arn
tags = var.tags
}
# KMS Key for EKS
resource "aws_kms_key" "eks" {
description = "EKS Secret Encryption Key"
deletion_window_in_days = 7
enable_key_rotation = true
tags = var.tags
}
resource "aws_kms_alias" "eks" {
name = "alias/${var.cluster_name}-eks"
target_key_id = aws_kms_key.eks.key_id
}
# Security Group for EKS Cluster
# Only in-VPC HTTPS reaches the control plane ENIs; all egress allowed.
resource "aws_security_group" "eks_cluster" {
name_prefix = "${var.cluster_name}-cluster-"
vpc_id = var.vpc_id
ingress {
description = "HTTPS"
from_port = 443
to_port = 443
protocol = "tcp"
cidr_blocks = [var.vpc_cidr]
}
egress {
from_port = 0
to_port = 0
protocol = "-1"
cidr_blocks = ["0.0.0.0/0"]
}
tags = merge(var.tags, {
Name = "${var.cluster_name}-cluster-sg"
})
}
# Node Groups
# One managed node group per entry in var.node_groups; nodes live only
# in private subnets.
resource "aws_eks_node_group" "main" {
for_each = var.node_groups
cluster_name = aws_eks_cluster.main.name
node_group_name = each.key
node_role_arn = aws_iam_role.eks_node_group.arn
subnet_ids = var.private_subnet_ids
capacity_type = each.value.capacity_type
instance_types = each.value.instance_types
ami_type = each.value.ami_type
disk_size = each.value.disk_size
scaling_config {
desired_size = each.value.desired_size
max_size = each.value.max_size
min_size = each.value.min_size
}
update_config {
max_unavailable_percentage = each.value.max_unavailable_percentage
}
labels = each.value.labels
# Optional taints (e.g. to keep general workloads off spot nodes).
dynamic "taint" {
for_each = each.value.taints
content {
key = taint.value.key
value = taint.value.value
effect = taint.value.effect
}
}
depends_on = [
aws_iam_role_policy_attachment.eks_worker_node_policy,
aws_iam_role_policy_attachment.eks_cni_policy,
aws_iam_role_policy_attachment.eks_container_registry_policy,
]
tags = var.tags
}
环境配置
# environments/production/main.tf
#
# Production root module: wires the vpc and eks modules together.
# NOTE(review): local.common_tags (used below) is not defined in this
# excerpt — it must be declared in a locals block for this environment.
module "vpc" {
source = "../../modules/vpc"
project_name = var.project_name
environment = var.environment
vpc_cidr = var.vpc_cidr
enable_nat_gateway = true
}
module "eks" {
source = "../../modules/eks"
cluster_name = "${var.project_name}-${var.environment}"
kubernetes_version = var.kubernetes_version
vpc_id = module.vpc.vpc_id
vpc_cidr = module.vpc.vpc_cidr
private_subnet_ids = module.vpc.private_subnet_ids
public_subnet_ids = module.vpc.public_subnet_ids
# Public endpoint disabled in production; the cidrs below therefore
# have no effect unless public access is re-enabled.
endpoint_public_access = false
endpoint_public_access_cidrs = var.allowed_cidr_blocks
node_groups = {
# Baseline on-demand capacity.
general = {
capacity_type = "ON_DEMAND"
instance_types = ["t3.medium"]
ami_type = "AL2_x86_64"
disk_size = 50
desired_size = 3
max_size = 10
min_size = 3
max_unavailable_percentage = 25
labels = {
role = "general"
}
taints = []
}
# Burst capacity on spot; tainted so only tolerant workloads land here.
spot = {
capacity_type = "SPOT"
instance_types = ["t3.medium", "t3.large"]
ami_type = "AL2_x86_64"
disk_size = 50
desired_size = 2
max_size = 20
min_size = 0
max_unavailable_percentage = 50
labels = {
role = "spot"
}
taints = [
{
key = "spot"
value = "true"
effect = "NO_SCHEDULE"
}
]
}
}
tags = local.common_tags
}
# environments/production/variables.tf
# Input variables for the production environment; defaults here are
# overridden by terraform.tfvars below.
variable "project_name" {
description = "Name of the project"
type = string
default = "myapp"
}
variable "environment" {
description = "Environment name"
type = string
default = "production"
}
variable "vpc_cidr" {
description = "CIDR block for VPC"
type = string
default = "10.0.0.0/16"
}
variable "kubernetes_version" {
description = "Kubernetes version"
type = string
default = "1.27"
}
variable "allowed_cidr_blocks" {
description = "CIDR blocks allowed to access EKS API"
type = list(string)
default = ["10.0.0.0/8"]
}
# environments/production/terraform.tfvars
# Concrete values for this environment (take precedence over defaults).
project_name = "myapp"
environment = "production"
vpc_cidr = "10.0.0.0/16"
kubernetes_version = "1.27"
allowed_cidr_blocks = [
"10.0.0.0/8",
"172.16.0.0/12"
]
Ansible配置管理
项目结构
ansible-configuration/
├── inventories/
│ ├── dev/
│ │ ├── hosts.yml
│ │ └── group_vars/
│ ├── staging/
│ └── production/
├── roles/
│ ├── common/
│ │ ├── tasks/main.yml
│ │ ├── handlers/main.yml
│ │ ├── templates/
│ │ ├── files/
│ │ └── vars/main.yml
│ ├── docker/
│ ├── kubernetes/
│ └── monitoring/
├── playbooks/
│ ├── site.yml
│ ├── deploy.yml
│ └── maintenance.yml
├── group_vars/
│ ├── all.yml
│ └── production.yml
└── ansible.cfg
核心Playbook
# playbooks/site.yml
---
# Entry-point playbook: baseline configuration for every host, then
# role-specific plays for Kubernetes and database host groups.
- name: Configure all servers
  hosts: all
  become: yes
  gather_facts: yes
  pre_tasks:
    # FIX: the generic `package` module has no update_cache option
    # (it only supports name/state/use), so the original task would
    # fail. Refresh the cache with the distro-specific mechanism.
    - name: Update apt package cache
      apt:
        update_cache: yes
      when: ansible_os_family == 'Debian'
    - name: Update yum/dnf metadata cache
      command: yum makecache
      changed_when: false  # cache refresh is not a config change
      when: ansible_os_family == 'RedHat'
  roles:
    - common
    - security
    - monitoring

# Kubernetes worker/control-plane hosts: container runtime first,
# then kubelet/kubeadm/kubectl, then the CNI plugin.
- name: Configure Kubernetes nodes
  hosts: kubernetes
  become: yes
  roles:
    - docker
    - kubernetes
    - cni

- name: Configure database servers
  hosts: database
  become: yes
  roles:
    - postgresql
    - backup
    - monitoring
# roles/common/tasks/main.yml
# Baseline OS configuration applied to every host: tooling, timezone,
# NTP, application user, SSH hardening and a minimal firewall.
---
- name: Install essential packages
package:
name:
- curl
- wget
- git
- htop
- vim
- unzip
- jq
state: present
- name: Configure timezone
timezone:
name: "{{ system_timezone | default('UTC') }}"
- name: Configure NTP
template:
src: ntp.conf.j2
dest: /etc/ntp.conf
backup: yes
notify: restart ntp
# NOTE(review): this role runs before the docker role in site.yml, so
# the `docker` group may not exist yet and this task would fail on a
# fresh host — confirm ordering or create the group first.
- name: Create application user
user:
name: "{{ app_user }}"
shell: /bin/bash
home: "/home/{{ app_user }}"
create_home: yes
groups: docker
append: yes
# validate runs `sshd -t` against the rendered file before it is
# installed, so a bad template cannot lock us out.
- name: Configure SSH
template:
src: sshd_config.j2
dest: /etc/ssh/sshd_config
backup: yes
validate: sshd -t -f %s
notify: restart ssh
# Allow SSH/HTTP/HTTPS; ufw itself is only enabled via the handler.
- name: Configure firewall
ufw:
rule: "{{ item.rule }}"
port: "{{ item.port }}"
proto: "{{ item.proto | default('tcp') }}"
loop:
- { rule: 'allow', port: '22' }
- { rule: 'allow', port: '80' }
- { rule: 'allow', port: '443' }
notify: enable ufw
# roles/docker/tasks/main.yml
---
# Installs Docker CE from Docker's official apt repository.
# FIX: the apt_key module is deprecated because apt-key itself is
# deprecated on Debian/Ubuntu; the supported approach is to place the
# key under /etc/apt/keyrings and reference it with signed-by.
- name: Ensure apt keyring directory exists
  file:
    path: /etc/apt/keyrings
    state: directory
    mode: '0755'

- name: Add Docker GPG key
  get_url:
    url: https://download.docker.com/linux/ubuntu/gpg
    dest: /etc/apt/keyrings/docker.asc
    mode: '0644'

- name: Add Docker repository
  apt_repository:
    repo: "deb [arch=amd64 signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu {{ ansible_distribution_release }} stable"
    state: present

- name: Install Docker
  package:
    name:
      - docker-ce
      - docker-ce-cli
      - containerd.io
      - docker-compose-plugin
    state: present

# Rendered daemon.json (registry mirrors, log driver, etc. come from
# the template); restart handler picks up changes.
- name: Configure Docker daemon
  template:
    src: daemon.json.j2
    dest: /etc/docker/daemon.json
  notify: restart docker

- name: Start and enable Docker
  systemd:
    name: docker
    state: started
    enabled: yes

# docker_users is expected from group_vars; members get rootless access
# to the Docker socket.
- name: Add users to docker group
  user:
    name: "{{ item }}"
    groups: docker
    append: yes
  loop: "{{ docker_users }}"
# roles/kubernetes/tasks/main.yml
---
# Installs kubelet/kubeadm/kubectl from the community-owned pkgs.k8s.io
# repositories.
# FIX: the legacy apt.kubernetes.io repository (and its
# packages.cloud.google.com key) was frozen and then shut down; new
# installs must use pkgs.k8s.io, which serves one repository per minor
# version.
- name: Ensure apt keyring directory exists
  file:
    path: /etc/apt/keyrings
    state: directory
    mode: '0755'

# The minor version ("1.27") is derived from kubernetes_version so a
# full version pin like "1.27.4-1.1" still selects the right repo.
- name: Add Kubernetes GPG key
  get_url:
    url: "https://pkgs.k8s.io/core:/stable:/v{{ kubernetes_version.split('.')[:2] | join('.') }}/deb/Release.key"
    dest: /etc/apt/keyrings/kubernetes-apt-keyring.asc
    mode: '0644'

- name: Add Kubernetes repository
  apt_repository:
    repo: "deb [signed-by=/etc/apt/keyrings/kubernetes-apt-keyring.asc] https://pkgs.k8s.io/core:/stable:/v{{ kubernetes_version.split('.')[:2] | join('.') }}/deb/ /"
    state: present

# FIX: allow_downgrade is a yum/dnf option, not valid with the generic
# package module; use apt explicitly since the repo setup above is
# apt-only anyway.
# NOTE(review): kubernetes_version must match the repo's full package
# version string (e.g. "1.27.4-1.1") for the pin to resolve — confirm.
- name: Install Kubernetes components
  apt:
    name:
      - "kubelet={{ kubernetes_version }}"
      - "kubeadm={{ kubernetes_version }}"
      - "kubectl={{ kubernetes_version }}"
    state: present
    allow_downgrade: yes

# Prevent unattended upgrades from drifting the cluster version.
- name: Hold Kubernetes packages
  dpkg_selections:
    name: "{{ item }}"
    selection: hold
  loop:
    - kubelet
    - kubeadm
    - kubectl

- name: Configure kubelet
  template:
    src: kubelet-config.yaml.j2
    dest: /var/lib/kubelet/config.yaml
  notify: restart kubelet

- name: Start and enable kubelet
  systemd:
    name: kubelet
    state: started
    enabled: yes
动态Inventory
#!/usr/bin/env python3
# inventories/aws_ec2.py
"""Dynamic Ansible inventory grouping running EC2 instances by tag.

Usage:
    aws_ec2.py --list [environment]   # full inventory (default: production)
    aws_ec2.py --host <hostname>      # per-host vars (empty; served via _meta)
"""
import boto3
import json
import sys


def get_ec2_instances(environment='production'):
    """Return an Ansible dynamic-inventory dict for running instances
    whose Environment tag equals `environment`.

    The result contains `_meta.hostvars` for every instance plus one
    group per Role tag and one `env_<name>` group per Environment tag.
    """
    ec2 = boto3.client('ec2')
    response = ec2.describe_instances(
        Filters=[
            {'Name': 'instance-state-name', 'Values': ['running']},
            {'Name': 'tag:Environment', 'Values': [environment]},
        ]
    )

    inventory = {
        '_meta': {
            'hostvars': {}
        },
        'all': {
            'children': ['ungrouped']
        }
    }

    for reservation in response['Reservations']:
        for instance in reservation['Instances']:
            instance_id = instance['InstanceId']
            private_ip = instance.get('PrivateIpAddress', '')
            public_ip = instance.get('PublicIpAddress', '')

            # Collect the instance's tags into a plain dict.
            tags = {tag['Key']: tag['Value'] for tag in instance.get('Tags', [])}

            # Host variables; prefer the public IP for ansible_host when
            # one exists, otherwise fall back to the private IP.
            hostvars = {
                'ansible_host': public_ip or private_ip,
                'ansible_user': 'ubuntu',
                'instance_id': instance_id,
                'instance_type': instance['InstanceType'],
                'private_ip': private_ip,
                'public_ip': public_ip,
                'tags': tags
            }
            inventory['_meta']['hostvars'][instance_id] = hostvars

            # Group by Role tag.
            role = tags.get('Role', 'ungrouped')
            inventory.setdefault(role, {'hosts': []})['hosts'].append(instance_id)

            # Group by Environment tag.
            env_group = f"env_{tags.get('Environment', 'unknown')}"
            inventory.setdefault(env_group, {'hosts': []})['hosts'].append(instance_id)

    return inventory


if __name__ == '__main__':
    # BUG FIX: the original passed sys.argv[1] — the literal string
    # '--list' — as the Environment tag filter, so the filter never
    # matched real instances. The optional environment now follows the
    # --list flag instead.
    if len(sys.argv) >= 2 and sys.argv[1] == '--list':
        environment = sys.argv[2] if len(sys.argv) > 2 else 'production'
        print(json.dumps(get_ec2_instances(environment), indent=2))
    elif len(sys.argv) == 3 and sys.argv[1] == '--host':
        # All host vars are served through _meta, so --host returns {}.
        print(json.dumps({}))
    else:
        print("Usage: %s --list [environment] or %s --host <hostname>" % (sys.argv[0], sys.argv[0]))
CI/CD集成
GitLab CI配置
# .gitlab-ci.yml for Infrastructure
# Pipeline: validate -> plan -> (manual) apply -> ansible configure.
# ENVIRONMENT is expected to be provided (e.g. via pipeline variables).
stages:
- validate
- plan
- apply
- configure
variables:
TF_ROOT: ${CI_PROJECT_DIR}/environments/${ENVIRONMENT}
TF_ADDRESS: ${CI_API_V4_URL}/projects/${CI_PROJECT_ID}/terraform/state/${ENVIRONMENT}
# Cache provider plugins between jobs for the same environment.
cache:
key: "${ENVIRONMENT}"
paths:
- ${TF_ROOT}/.terraform
# NOTE(review): these -backend-config keys (address/lock_address/
# username/password/lock_method...) belong to GitLab's HTTP state
# backend, but shared/backend.tf earlier configures an S3 backend —
# the two are mutually exclusive; confirm which backend is intended.
before_script:
- cd ${TF_ROOT}
- terraform --version
- terraform init -backend-config="address=${TF_ADDRESS}" -backend-config="lock_address=${TF_ADDRESS}/lock" -backend-config="unlock_address=${TF_ADDRESS}/lock" -backend-config="username=${GITLAB_USER_LOGIN}" -backend-config="password=${CI_JOB_TOKEN}" -backend-config="lock_method=POST" -backend-config="unlock_method=DELETE" -backend-config="retry_wait_min=5"
validate:
stage: validate
script:
- terraform validate
- terraform fmt -check
only:
- merge_requests
- main
# Saves the binary plan as an artifact so apply runs exactly what was
# reviewed.
plan:
stage: plan
script:
- terraform plan -out="planfile"
artifacts:
name: plan
paths:
- ${TF_ROOT}/planfile
expire_in: 1 week
only:
- merge_requests
- main
# Manual gate: apply only runs when a human triggers it on main.
apply:
stage: apply
script:
- terraform apply -input=false "planfile"
dependencies:
- plan
when: manual
only:
- main
environment:
name: ${ENVIRONMENT}
# NOTE(review): "ansible/ansible:latest" is not an official maintained
# image — confirm the intended Ansible container image.
configure:
stage: configure
image: ansible/ansible:latest
script:
- cd ansible-configuration
- ansible-playbook -i inventories/${ENVIRONMENT}/hosts.yml playbooks/site.yml
dependencies:
- apply
only:
- main
自动化脚本
#!/bin/bash
# scripts/deploy.sh
# Plan/apply/destroy wrapper: runs Terraform for one environment and,
# on apply, follows up with the Ansible site playbook.
# Usage: deploy.sh <environment> <plan|apply|destroy>
# -e: exit on error; -u: unset vars are errors; -o pipefail: a failing
# command anywhere in a pipeline fails the pipeline.
set -euo pipefail
ENVIRONMENT=${1:-dev}
ACTION=${2:-plan}
echo "🚀 Starting infrastructure deployment for environment: $ENVIRONMENT"
# 验证环境
# Refuse to run against an environment directory that doesn't exist.
if [[ ! -d "environments/$ENVIRONMENT" ]]; then
echo "❌ Environment $ENVIRONMENT does not exist"
exit 1
fi
cd "environments/$ENVIRONMENT"
# 初始化Terraform
echo "📦 Initializing Terraform..."
terraform init
# 验证配置
echo "✅ Validating Terraform configuration..."
terraform validate
# 格式化检查
echo "🎨 Checking Terraform formatting..."
terraform fmt -check
case $ACTION in
"plan")
echo "📋 Creating Terraform plan..."
terraform plan -out=tfplan
;;
"apply")
echo "🔨 Applying Terraform changes..."
# Prefer a previously reviewed plan file; otherwise apply directly.
# NOTE(review): the -auto-approve fallback skips human review —
# consider requiring a plan file in production.
if [[ -f "tfplan" ]]; then
terraform apply tfplan
else
terraform apply -auto-approve
fi
echo "⚙️ Running Ansible configuration..."
# Relative path assumes this script is invoked from the repo root.
cd ../../ansible-configuration
ansible-playbook -i inventories/$ENVIRONMENT/hosts.yml playbooks/site.yml
;;
"destroy")
echo "💥 Destroying infrastructure..."
# NOTE(review): unconditional -auto-approve destroy is dangerous;
# consider an interactive confirmation for non-dev environments.
terraform destroy -auto-approve
;;
*)
echo "❌ Unknown action: $ACTION"
echo "Usage: $0 <environment> <plan|apply|destroy>"
exit 1
;;
esac
echo "✅ Infrastructure deployment completed successfully!"
最佳实践总结
1. 状态管理
# 远程状态配置
terraform {
backend "s3" {
bucket = "terraform-state-bucket"
key = "path/to/terraform.tfstate"
region = "us-west-2"
encrypt = true
dynamodb_table = "terraform-locks"
}
}
# 状态导入示例
terraform import aws_instance.example i-1234567890abcdef0
2. 安全最佳实践
# 密钥管理
export AWS_ACCESS_KEY_ID=$(vault kv get -field=access_key secret/aws/terraform)
export AWS_SECRET_ACCESS_KEY=$(vault kv get -field=secret_key secret/aws/terraform)
# Terraform变量加密
terraform plan -var-file="secrets.tfvars.encrypted"
# Ansible Vault
ansible-vault encrypt group_vars/production/secrets.yml
ansible-playbook --ask-vault-pass playbooks/site.yml
3. 监控和告警
# Prometheus监控规则
groups:
- name: infrastructure
rules:
- alert: TerraformDrift
expr: terraform_state_drift > 0
for: 5m
labels:
severity: warning
annotations:
summary: "Terraform state drift detected"
- alert: AnsiblePlaybookFailed
expr: ansible_playbook_failures > 0
for: 1m
labels:
severity: critical
annotations:
summary: "Ansible playbook execution failed"
通过本文的实践指南,您可以构建一个完整的基础设施即代码解决方案,实现基础设施的自动化管理和配置,提高运维效率和系统可靠性。
基础设施即代码是现代运维的核心实践,通过Terraform和Ansible的结合使用,可以实现从基础设施供应到应用配置的全流程自动化。