# Building an Enterprise Observability Platform: A Complete Prometheus + Grafana + Jaeger + ELK Solution

Observability is a core capability for operating modern distributed systems: through its three pillars of metrics, logs, and traces, it provides comprehensive monitoring and diagnostic coverage. This article walks through how to build a complete, enterprise-grade observability platform.
## Observability Architecture Design

### Overall Architecture

```mermaid
graph TB
    subgraph "Data Sources"
        A1[Application Services] --> B1[Metrics]
        A1 --> B2[Logs]
        A1 --> B3[Traces]
        A2[Infrastructure] --> B1
        A2 --> B2
        A3[Middleware] --> B1
        A3 --> B2
        A3 --> B3
    end
    subgraph "Data Collection"
        B1 --> C1[Prometheus]
        B2 --> C2[Filebeat/Fluentd]
        B3 --> C3[Jaeger Agent]
        C2 --> C4[Logstash]
        C3 --> C5[Jaeger Collector]
    end
    subgraph "Data Storage"
        C1 --> D1[Prometheus TSDB]
        C4 --> D2[Elasticsearch]
        C5 --> D3[Jaeger Storage]
        D1 --> D4[Thanos/VictoriaMetrics]
    end
    subgraph "Data Query"
        D4 --> E1[PromQL Queries]
        D2 --> E2[Elasticsearch Queries]
        D3 --> E3[Jaeger Queries]
    end
    subgraph "Visualization"
        E1 --> F1[Grafana]
        E2 --> F2[Kibana]
        E3 --> F3[Jaeger UI]
        F1 --> F4[Unified Dashboard]
    end
    subgraph "Alerting"
        E1 --> G1[AlertManager]
        G1 --> G2[Notification Channels]
    end
```

### Technology Stack Selection
| Component | Technology | Responsibility | Strengths |
|---|---|---|---|
| Metrics monitoring | Prometheus + Thanos | Time-series collection and storage | High performance, rich ecosystem |
| Log aggregation | ELK Stack | Log collection, processing, and storage | Mature, stable, feature-rich |
| Distributed tracing | Jaeger | Distributed tracing | Cloud native, excellent performance |
| Visualization | Grafana | Unified monitoring dashboards | Rich plugins, friendly UI |
| Alerting | AlertManager | Alert routing and notification | Flexible alerting rules |
| Service discovery | Consul/Kubernetes | Dynamic service discovery | Automated configuration |
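The manifests in this article target Kubernetes, but the stack is easy to evaluate locally first. A minimal Docker Compose sketch, assuming default ports; the Prometheus, Grafana, and Elastic image tags mirror the manifests below, while the Jaeger all-in-one tag is an assumption:

```yaml
# docker-compose.yml - local evaluation only, not a production layout
version: "3.8"
services:
  prometheus:
    image: prom/prometheus:v2.45.0
    ports: ["9090:9090"]
    volumes:
      - ./prometheus.yml:/etc/prometheus/prometheus.yml:ro
  grafana:
    image: grafana/grafana:10.1.0
    ports: ["3000:3000"]
  jaeger:
    image: jaegertracing/all-in-one:1.48   # agent, collector, query, and UI in one process
    ports: ["16686:16686", "6831:6831/udp"]
  elasticsearch:
    image: docker.elastic.co/elasticsearch/elasticsearch:8.9.0
    environment:
      - discovery.type=single-node
      - xpack.security.enabled=false       # acceptable locally; keep security on in production
    ports: ["9200:9200"]
  kibana:
    image: docker.elastic.co/kibana/kibana:8.9.0
    ports: ["5601:5601"]
```

This does not replace the HA layout described below; it only shortens the feedback loop while developing dashboards and alert rules.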
## Prometheus Monitoring
### Prometheus Cluster Configuration

```yaml
# prometheus-config.yaml
global:
scrape_interval: 15s
evaluation_interval: 15s
external_labels:
cluster: 'production'
region: 'us-west-2'
# Alerting rule files
rule_files:
- "/etc/prometheus/rules/*.yml"
# Alertmanager configuration
alerting:
alertmanagers:
- static_configs:
- targets:
- alertmanager-1:9093
- alertmanager-2:9093
- alertmanager-3:9093
# Scrape configurations
scrape_configs:
  # Prometheus self-monitoring
- job_name: 'prometheus'
static_configs:
- targets: ['localhost:9090']
scrape_interval: 5s
metrics_path: /metrics
  # Kubernetes API server
- job_name: 'kubernetes-apiservers'
kubernetes_sd_configs:
- role: endpoints
namespaces:
names:
- default
scheme: https
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
insecure_skip_verify: true
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
relabel_configs:
- source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
action: keep
regex: default;kubernetes;https
  # Kubernetes node monitoring
- job_name: 'kubernetes-nodes'
kubernetes_sd_configs:
- role: node
scheme: https
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
insecure_skip_verify: true
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
relabel_configs:
- action: labelmap
regex: __meta_kubernetes_node_label_(.+)
- target_label: __address__
replacement: kubernetes.default.svc:443
- source_labels: [__meta_kubernetes_node_name]
regex: (.+)
target_label: __metrics_path__
replacement: /api/v1/nodes/${1}/proxy/metrics
  # cAdvisor container monitoring
- job_name: 'kubernetes-cadvisor'
kubernetes_sd_configs:
- role: node
scheme: https
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
insecure_skip_verify: true
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
relabel_configs:
- action: labelmap
regex: __meta_kubernetes_node_label_(.+)
- target_label: __address__
replacement: kubernetes.default.svc:443
- source_labels: [__meta_kubernetes_node_name]
regex: (.+)
target_label: __metrics_path__
replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor
  # Pod monitoring
- job_name: 'kubernetes-pods'
kubernetes_sd_configs:
- role: pod
relabel_configs:
- source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
action: keep
regex: true
- source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
action: replace
target_label: __metrics_path__
regex: (.+)
- source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
action: replace
regex: ([^:]+)(?::\d+)?;(\d+)
replacement: $1:$2
target_label: __address__
- action: labelmap
regex: __meta_kubernetes_pod_label_(.+)
- source_labels: [__meta_kubernetes_namespace]
action: replace
target_label: kubernetes_namespace
- source_labels: [__meta_kubernetes_pod_name]
action: replace
target_label: kubernetes_pod_name
  # Service monitoring
- job_name: 'kubernetes-services'
kubernetes_sd_configs:
- role: service
relabel_configs:
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
action: keep
regex: true
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
action: replace
target_label: __scheme__
regex: (https?)
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
action: replace
target_label: __metrics_path__
regex: (.+)
- source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
action: replace
target_label: __address__
regex: ([^:]+)(?::\d+)?;(\d+)
replacement: $1:$2
- action: labelmap
regex: __meta_kubernetes_service_label_(.+)
- source_labels: [__meta_kubernetes_namespace]
action: replace
target_label: kubernetes_namespace
- source_labels: [__meta_kubernetes_service_name]
action: replace
target_label: kubernetes_name
  # Custom application metrics
- job_name: 'application-metrics'
consul_sd_configs:
- server: 'consul.service.consul:8500'
services: ['app-metrics']
relabel_configs:
- source_labels: [__meta_consul_tags]
regex: '.*,metrics,.*'
action: keep
- source_labels: [__meta_consul_service]
target_label: job
- source_labels: [__meta_consul_node]
target_label: instance
# Remote write (for long-term storage)
remote_write:
- url: "http://thanos-receive:19291/api/v1/receive"
queue_config:
max_samples_per_send: 1000
max_shards: 200
capacity: 2500
# Remote read (note: the Prometheus remote-read protocol endpoint is /api/v1/read,
# not /api/v1/query; point this at a backend that implements remote read)
remote_read:
  - url: "http://thanos-query:9090/api/v1/read"
    read_recent: true
```
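For the `kubernetes-pods` job above to discover a workload, the pod has to carry the `prometheus.io/*` annotations that the relabel rules key on. An illustrative pod template (the name, port, path, and image are placeholders):

```yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: my-service              # hypothetical workload
spec:
  selector:
    matchLabels:
      app: my-service
  template:
    metadata:
      labels:
        app: my-service
      annotations:
        prometheus.io/scrape: "true"    # matched by the keep rule
        prometheus.io/path: "/metrics"  # overrides __metrics_path__
        prometheus.io/port: "8080"      # rewritten into __address__
    spec:
      containers:
        - name: app
          image: registry.example.com/my-service:latest
          ports:
            - containerPort: 8080
```

With these in place, the target shows up with the `kubernetes_namespace` and `kubernetes_pod_name` labels attached by the relabeling above.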
### Alerting Rule Configuration

```yaml
# alerts/infrastructure.yml
groups:
- name: infrastructure
rules:
      # Node down
- alert: NodeDown
expr: up{job="kubernetes-nodes"} == 0
for: 1m
labels:
severity: critical
team: infrastructure
annotations:
summary: "Node {{ $labels.instance }} is down"
description: "Node {{ $labels.instance }} has been down for more than 1 minute."
runbook_url: "https://runbooks.company.com/node-down"
      # High CPU usage
- alert: HighCPUUsage
expr: 100 - (avg by(instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
for: 5m
labels:
severity: warning
team: infrastructure
annotations:
summary: "High CPU usage on {{ $labels.instance }}"
description: "CPU usage is above 80% for more than 5 minutes on {{ $labels.instance }}."
      # High memory usage
- alert: HighMemoryUsage
expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 85
for: 5m
labels:
severity: warning
team: infrastructure
annotations:
summary: "High memory usage on {{ $labels.instance }}"
description: "Memory usage is above 85% for more than 5 minutes on {{ $labels.instance }}."
      # Low disk space
- alert: DiskSpaceLow
expr: (1 - (node_filesystem_avail_bytes / node_filesystem_size_bytes)) * 100 > 85
for: 5m
labels:
severity: warning
team: infrastructure
annotations:
summary: "Disk space low on {{ $labels.instance }}"
description: "Disk usage is above 85% on {{ $labels.instance }} mount {{ $labels.mountpoint }}."
      # High disk I/O wait
- alert: HighDiskIOWait
expr: irate(node_cpu_seconds_total{mode="iowait"}[5m]) * 100 > 20
for: 5m
labels:
severity: warning
team: infrastructure
annotations:
summary: "High disk I/O wait on {{ $labels.instance }}"
description: "Disk I/O wait time is above 20% for more than 5 minutes on {{ $labels.instance }}."
- name: kubernetes
rules:
      # Pod restarting too often
- alert: PodRestartingTooOften
expr: increase(kube_pod_container_status_restarts_total[1h]) > 5
for: 5m
labels:
severity: warning
team: platform
annotations:
summary: "Pod {{ $labels.namespace }}/{{ $labels.pod }} restarting too often"
description: "Pod {{ $labels.namespace }}/{{ $labels.pod }} has restarted {{ $value }} times in the last hour."
      # Pod stuck in Pending
- alert: PodStuckInPending
expr: kube_pod_status_phase{phase="Pending"} == 1
for: 10m
labels:
severity: warning
team: platform
annotations:
summary: "Pod {{ $labels.namespace }}/{{ $labels.pod }} stuck in Pending"
description: "Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in Pending state for more than 10 minutes."
      # Deployment replica mismatch
- alert: DeploymentReplicasMismatch
expr: kube_deployment_spec_replicas != kube_deployment_status_available_replicas
for: 5m
labels:
severity: warning
team: platform
annotations:
summary: "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} replicas mismatch"
description: "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has {{ $value }} available replicas, expected {{ $labels.spec_replicas }}."
- name: application
rules:
      # High HTTP error rate
- alert: HighHTTPErrorRate
expr: (sum(rate(http_requests_total{status=~"5.."}[5m])) by (service) / sum(rate(http_requests_total[5m])) by (service)) * 100 > 5
for: 5m
labels:
severity: critical
team: application
annotations:
summary: "High HTTP error rate for {{ $labels.service }}"
description: "HTTP error rate is {{ $value }}% for service {{ $labels.service }}."
      # Slow response times
- alert: HighResponseTime
expr: histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le, service)) > 1
for: 5m
labels:
severity: warning
team: application
annotations:
summary: "High response time for {{ $labels.service }}"
description: "95th percentile response time is {{ $value }}s for service {{ $labels.service }}."
      # Database connection pool nearly exhausted
- alert: DatabaseConnectionPoolExhausted
expr: db_connection_pool_active / db_connection_pool_max > 0.9
for: 2m
labels:
severity: critical
team: database
annotations:
summary: "Database connection pool nearly exhausted"
description: "Database connection pool usage is {{ $value | humanizePercentage }} for {{ $labels.database }}."
### Long-Term Storage with Thanos

```yaml
# thanos-sidecar.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: prometheus-with-thanos
spec:
replicas: 1
selector:
matchLabels:
app: prometheus
template:
metadata:
labels:
app: prometheus
spec:
containers:
- name: prometheus
image: prom/prometheus:v2.45.0
args:
- '--config.file=/etc/prometheus/prometheus.yml'
- '--storage.tsdb.path=/prometheus'
        - '--storage.tsdb.retention.time=2h'  # keep only 2 hours locally; Thanos owns long-term storage
- '--storage.tsdb.min-block-duration=2h'
- '--storage.tsdb.max-block-duration=2h'
- '--web.enable-lifecycle'
- '--web.enable-admin-api'
ports:
- containerPort: 9090
volumeMounts:
- name: config
mountPath: /etc/prometheus
- name: storage
mountPath: /prometheus
- name: thanos-sidecar
image: thanosio/thanos:v0.32.0
args:
- sidecar
- --tsdb.path=/prometheus
- --prometheus.url=http://localhost:9090
- --grpc-address=0.0.0.0:10901
- --http-address=0.0.0.0:10902
- --objstore.config-file=/etc/thanos/objstore.yml
ports:
- containerPort: 10901
name: grpc
- containerPort: 10902
name: http
volumeMounts:
- name: storage
mountPath: /prometheus
- name: objstore-config
mountPath: /etc/thanos
      volumes:
      - name: config
        configMap:
          name: prometheus-config
      - name: storage
        persistentVolumeClaim:
          claimName: prometheus-storage
      - name: objstore-config
        secret:
          secretName: thanos-objstore-config
---
# thanos-store.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: thanos-store
spec:
  replicas: 2
  selector:
    matchLabels:
      app: thanos-store
  template:
    metadata:
      labels:
        app: thanos-store
    spec:
      containers:
      - name: thanos-store
        image: thanosio/thanos:v0.32.0
        args:
        - store
        - --grpc-address=0.0.0.0:10901
        - --http-address=0.0.0.0:10902
        - --data-dir=/var/thanos/store
        - --objstore.config-file=/etc/thanos/objstore.yml
        - --index-cache-size=250MB
        - --chunk-pool-size=1GB
        ports:
        - containerPort: 10901
          name: grpc
        - containerPort: 10902
          name: http
        volumeMounts:
        - name: data
          mountPath: /var/thanos/store
        - name: objstore-config
          mountPath: /etc/thanos
        resources:
          requests:
            memory: 2Gi
            cpu: 500m
          limits:
            memory: 4Gi
            cpu: 1
      volumes:
      - name: data
        emptyDir: {}
      - name: objstore-config
        secret:
          secretName: thanos-objstore-config
---
# thanos-query.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: thanos-query
spec:
  replicas: 2
  selector:
    matchLabels:
      app: thanos-query
  template:
    metadata:
      labels:
        app: thanos-query
    spec:
      containers:
      - name: thanos-query
        image: thanosio/thanos:v0.32.0
        args:
        - query
        - --grpc-address=0.0.0.0:10901
        - --http-address=0.0.0.0:9090
        - --store=thanos-sidecar:10901
        - --store=thanos-store:10901
        - --query.replica-label=replica
        - --query.auto-downsampling
        ports:
        - containerPort: 9090
          name: http
        - containerPort: 10901
          name: grpc
        resources:
          requests:
            memory: 512Mi
            cpu: 250m
          limits:
            memory: 1Gi
            cpu: 500m
```
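Both the sidecar and the store gateway reference `--objstore.config-file`, but the object-storage definition itself never appears in these manifests. A minimal sketch for an S3-compatible bucket (bucket name and endpoint are placeholders):

```yaml
# objstore.yml - stored in the thanos-objstore-config Secret
type: S3
config:
  bucket: "thanos-metrics"               # placeholder bucket name
  endpoint: "s3.us-west-2.amazonaws.com"
  access_key: "REPLACE_ME"               # Thanos does not expand env vars here;
  secret_key: "REPLACE_ME"               # populate real values via your secret tooling
```

The same Secret is mounted by the sidecar and the store gateway above.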
## Alerting System Configuration
### AlertManager Configuration

```yaml
# alertmanager-config.yaml
apiVersion: v1
kind: ConfigMap
metadata:
name: alertmanager-config
data:
alertmanager.yml: |
global:
smtp_smarthost: 'smtp.company.com:587'
smtp_from: 'alerts@company.com'
smtp_auth_username: 'alerts@company.com'
smtp_auth_password: '${SMTP_PASSWORD}'
slack_api_url: '${SLACK_API_URL}'
route:
group_by: ['alertname', 'cluster', 'service']
group_wait: 10s
group_interval: 10s
repeat_interval: 1h
receiver: 'default'
routes:
- match:
severity: critical
receiver: 'critical-alerts'
group_wait: 5s
repeat_interval: 30m
- match:
team: infrastructure
receiver: 'infrastructure-team'
- match:
team: application
receiver: 'application-team'
- match:
team: database
receiver: 'database-team'
receivers:
- name: 'default'
email_configs:
- to: 'ops@company.com'
subject: '[{{ .Status | toUpper }}] {{ .GroupLabels.alertname }}'
body: |
{{ range .Alerts }}
Alert: {{ .Annotations.summary }}
Description: {{ .Annotations.description }}
Labels: {{ range .Labels.SortedPairs }}{{ .Name }}={{ .Value }} {{ end }}
{{ end }}
- name: 'critical-alerts'
email_configs:
- to: 'critical-alerts@company.com'
subject: '[CRITICAL] {{ .GroupLabels.alertname }}'
body: |
CRITICAL ALERT TRIGGERED
{{ range .Alerts }}
Alert: {{ .Annotations.summary }}
Description: {{ .Annotations.description }}
Severity: {{ .Labels.severity }}
Service: {{ .Labels.service }}
Instance: {{ .Labels.instance }}
Time: {{ .StartsAt }}
{{ end }}
slack_configs:
- channel: '#critical-alerts'
title: 'Critical Alert: {{ .GroupLabels.alertname }}'
text: |
{{ range .Alerts }}
*Alert:* {{ .Annotations.summary }}
*Description:* {{ .Annotations.description }}
*Severity:* {{ .Labels.severity }}
*Service:* {{ .Labels.service }}
{{ end }}
send_resolved: true
pagerduty_configs:
- routing_key: '${PAGERDUTY_ROUTING_KEY}'
        description: '{{ .GroupLabels.alertname }}: {{ .CommonAnnotations.summary }}'
- name: 'infrastructure-team'
email_configs:
- to: 'infrastructure@company.com'
subject: '[{{ .Status | toUpper }}] Infrastructure Alert'
slack_configs:
- channel: '#infrastructure-alerts'
send_resolved: true
- name: 'application-team'
email_configs:
- to: 'application@company.com'
subject: '[{{ .Status | toUpper }}] Application Alert'
slack_configs:
- channel: '#application-alerts'
send_resolved: true
- name: 'database-team'
email_configs:
- to: 'database@company.com'
subject: '[{{ .Status | toUpper }}] Database Alert'
slack_configs:
- channel: '#database-alerts'
send_resolved: true
inhibit_rules:
- source_match:
severity: 'critical'
target_match:
severity: 'warning'
equal: ['alertname', 'instance']
- source_match:
alertname: 'NodeDown'
target_match_re:
alertname: '(HighCPUUsage|HighMemoryUsage|DiskSpaceLow)'
        equal: ['instance']
```
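A common refinement is muting non-critical notifications outside working hours. A sketch assuming Alertmanager 0.24 or newer, where `time_intervals` is a top-level section; the interval name and route are illustrative:

```yaml
time_intervals:
  - name: out-of-hours
    time_intervals:
      - weekdays: ['saturday', 'sunday']
      - times:
          - start_time: '00:00'
            end_time: '09:00'
        weekdays: ['monday:friday']
route:
  routes:
    - match:
        severity: warning
      receiver: 'default'
      mute_time_intervals: ['out-of-hours']
```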
## ELK Log Aggregation
### Elasticsearch Cluster Configuration
```yaml
# elasticsearch-master.yaml
apiVersion: apps/v1
kind: StatefulSet
metadata:
name: elasticsearch-master
spec:
serviceName: elasticsearch-master
replicas: 3
selector:
matchLabels:
app: elasticsearch
role: master
template:
metadata:
labels:
app: elasticsearch
role: master
spec:
initContainers:
- name: increase-vm-max-map
image: busybox
command: ["sysctl", "-w", "vm.max_map_count=262144"]
securityContext:
privileged: true
- name: increase-fd-ulimit
image: busybox
command: ["sh", "-c", "ulimit -n 65536"]
securityContext:
privileged: true
containers:
- name: elasticsearch
image: docker.elastic.co/elasticsearch/elasticsearch:8.9.0
env:
- name: node.name
valueFrom:
fieldRef:
fieldPath: metadata.name
- name: cluster.name
value: "production-cluster"
- name: discovery.seed_hosts
value: "elasticsearch-master-0.elasticsearch-master,elasticsearch-master-1.elasticsearch-master,elasticsearch-master-2.elasticsearch-master"
- name: cluster.initial_master_nodes
value: "elasticsearch-master-0,elasticsearch-master-1,elasticsearch-master-2"
- name: node.roles
value: "master"
- name: ES_JAVA_OPTS
value: "-Xms2g -Xmx2g"
- name: xpack.security.enabled
value: "true"
- name: xpack.security.transport.ssl.enabled
value: "true"
- name: xpack.security.transport.ssl.verification_mode
value: "certificate"
- name: xpack.security.transport.ssl.keystore.path
value: "/usr/share/elasticsearch/config/certs/elastic-certificates.p12"
- name: xpack.security.transport.ssl.truststore.path
value: "/usr/share/elasticsearch/config/certs/elastic-certificates.p12"
- name: xpack.security.http.ssl.enabled
value: "true"
- name: xpack.security.http.ssl.keystore.path
value: "/usr/share/elasticsearch/config/certs/elastic-certificates.p12"
ports:
- containerPort: 9200
name: http
- containerPort: 9300
name: transport
volumeMounts:
- name: data
mountPath: /usr/share/elasticsearch/data
- name: certs
mountPath: /usr/share/elasticsearch/config/certs
readOnly: true
resources:
requests:
memory: 2Gi
cpu: 1
limits:
memory: 4Gi
cpu: 2
volumes:
- name: certs
secret:
secretName: elastic-certificates
volumeClaimTemplates:
- metadata:
name: data
spec:
accessModes: ["ReadWriteOnce"]
storageClassName: "fast-ssd"
resources:
requests:
storage: 100Gi
---
# elasticsearch-data.yaml
apiVersion: apps/v1
kind: StatefulSet
metadata:
name: elasticsearch-data
spec:
serviceName: elasticsearch-data
replicas: 6
selector:
matchLabels:
app: elasticsearch
role: data
template:
metadata:
labels:
app: elasticsearch
role: data
spec:
containers:
- name: elasticsearch
image: docker.elastic.co/elasticsearch/elasticsearch:8.9.0
env:
- name: node.name
valueFrom:
fieldRef:
fieldPath: metadata.name
- name: cluster.name
value: "production-cluster"
- name: discovery.seed_hosts
value: "elasticsearch-master"
- name: node.roles
value: "data,ingest"
- name: ES_JAVA_OPTS
value: "-Xms4g -Xmx4g"
- name: xpack.security.enabled
value: "true"
- name: xpack.security.transport.ssl.enabled
value: "true"
- name: xpack.security.transport.ssl.verification_mode
value: "certificate"
- name: xpack.security.transport.ssl.keystore.path
value: "/usr/share/elasticsearch/config/certs/elastic-certificates.p12"
- name: xpack.security.transport.ssl.truststore.path
value: "/usr/share/elasticsearch/config/certs/elastic-certificates.p12"
ports:
- containerPort: 9200
name: http
- containerPort: 9300
name: transport
volumeMounts:
- name: data
mountPath: /usr/share/elasticsearch/data
- name: certs
mountPath: /usr/share/elasticsearch/config/certs
readOnly: true
resources:
requests:
memory: 4Gi
cpu: 2
limits:
memory: 8Gi
cpu: 4
volumes:
- name: certs
secret:
secretName: elastic-certificates
volumeClaimTemplates:
- metadata:
name: data
spec:
accessModes: ["ReadWriteOnce"]
storageClassName: "fast-ssd"
resources:
requests:
          storage: 500Gi
```
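The Logstash outputs below write daily indices (`application-logs-%{+YYYY.MM.dd}`), which grow without bound unless something expires them. An ILM policy attached through an index template handles that; the 30-day retention is an illustrative value:

```json
PUT _ilm/policy/application-logs-retention
{
  "policy": {
    "phases": {
      "delete": {
        "min_age": "30d",
        "actions": { "delete": {} }
      }
    }
  }
}
```

Attach it by setting `index.lifecycle.name: application-logs-retention` in the index template that matches these indices.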
### Logstash Configuration

```yaml
# logstash-config.yaml
apiVersion: v1
kind: ConfigMap
metadata:
name: logstash-config
data:
logstash.yml: |
http.host: "0.0.0.0"
path.config: /usr/share/logstash/pipeline
xpack.monitoring.enabled: true
xpack.monitoring.elasticsearch.hosts: ["https://elasticsearch:9200"]
xpack.monitoring.elasticsearch.username: "logstash_system"
xpack.monitoring.elasticsearch.password: "${LOGSTASH_PASSWORD}"
xpack.monitoring.elasticsearch.ssl.certificate_authority: "/usr/share/logstash/config/certs/ca.crt"
pipelines.yml: |
- pipeline.id: application-logs
path.config: "/usr/share/logstash/pipeline/application.conf"
pipeline.workers: 4
pipeline.batch.size: 1000
pipeline.batch.delay: 50
- pipeline.id: infrastructure-logs
path.config: "/usr/share/logstash/pipeline/infrastructure.conf"
pipeline.workers: 2
pipeline.batch.size: 500
pipeline.batch.delay: 100
application.conf: |
input {
beats {
port => 5044
type => "application"
}
kafka {
bootstrap_servers => "kafka:9092"
topics => ["application-logs"]
group_id => "logstash-application"
codec => "json"
}
}
filter {
      # Parse JSON-formatted application logs
if [fields][log_type] == "application" {
json {
source => "message"
}
        # Parse the timestamp
date {
match => [ "timestamp", "ISO8601" ]
target => "@timestamp"
}
        # Extract exception details
if [level] == "ERROR" {
grok {
match => { "message" => "%{JAVACLASS:exception_class}: %{GREEDYDATA:exception_message}" }
tag_on_failure => ["_grokparsefailure_exception"]
}
}
        # Add GeoIP information
if [client_ip] {
geoip {
source => "client_ip"
target => "geoip"
}
}
        # Mask sensitive information
mutate {
gsub => [
"message", "password=[^&\s]*", "password=***",
"message", "token=[^&\s]*", "token=***",
"message", "apikey=[^&\s]*", "apikey=***"
]
}
}
      # Extract performance metrics
if [fields][log_type] == "performance" {
grok {
match => { "message" => "response_time=%{NUMBER:response_time:float} status=%{NUMBER:status_code:int} method=%{WORD:http_method} path=%{URIPATH:request_path}" }
}
        # Categorize response times
if [response_time] {
if [response_time] < 100 {
mutate { add_field => { "performance_category" => "fast" } }
} else if [response_time] < 500 {
mutate { add_field => { "performance_category" => "normal" } }
} else if [response_time] < 1000 {
mutate { add_field => { "performance_category" => "slow" } }
} else {
mutate { add_field => { "performance_category" => "very_slow" } }
}
}
}
}
output {
      # Route to different indices by log level
if [level] == "ERROR" or [level] == "FATAL" {
elasticsearch {
hosts => ["https://elasticsearch:9200"]
index => "application-errors-%{+YYYY.MM.dd}"
user => "logstash_writer"
password => "${LOGSTASH_PASSWORD}"
ssl => true
cacert => "/usr/share/logstash/config/certs/ca.crt"
}
        # Also forward error logs to the alerting system (note: Alertmanager's
        # POST /api/v1/alerts endpoint expects a bare JSON array, so this
        # wrapped mapping may need adapting to your Alertmanager version)
http {
url => "http://alertmanager:9093/api/v1/alerts"
http_method => "post"
format => "json"
mapping => {
"alerts" => [{
"labels" => {
"alertname" => "ApplicationError"
"severity" => "critical"
"service" => "%{[fields][service]}"
"environment" => "%{[fields][environment]}"
}
"annotations" => {
"summary" => "Application error detected"
"description" => "%{message}"
}
}]
}
}
} else {
elasticsearch {
hosts => ["https://elasticsearch:9200"]
index => "application-logs-%{+YYYY.MM.dd}"
user => "logstash_writer"
password => "${LOGSTASH_PASSWORD}"
ssl => true
cacert => "/usr/share/logstash/config/certs/ca.crt"
}
}
      # Send performance metrics to a time-series database
if [fields][log_type] == "performance" {
influxdb {
host => "influxdb"
port => 8086
database => "performance_metrics"
measurement => "http_requests"
send_as_tags => ["http_method", "status_code", "performance_category"]
}
}
}
infrastructure.conf: |
input {
beats {
port => 5045
type => "infrastructure"
}
}
filter {
      # System log parsing
if [fields][log_type] == "syslog" {
grok {
match => { "message" => "%{SYSLOGTIMESTAMP:timestamp} %{IPORHOST:host} %{PROG:program}(?:\[%{POSINT:pid}\])?: %{GREEDYDATA:message}" }
overwrite => [ "message" ]
}
date {
match => [ "timestamp", "MMM d HH:mm:ss", "MMM dd HH:mm:ss" ]
}
}
      # Kubernetes log parsing
if [kubernetes] {
        # Extract pod information
mutate {
add_field => {
"k8s_namespace" => "%{[kubernetes][namespace]}"
"k8s_pod" => "%{[kubernetes][pod][name]}"
"k8s_container" => "%{[kubernetes][container][name]}"
}
}
        # Parse container logs
if [k8s_container] == "nginx" {
grok {
match => { "message" => "%{COMBINEDAPACHELOG}" }
}
}
}
}
output {
elasticsearch {
hosts => ["https://elasticsearch:9200"]
index => "infrastructure-logs-%{+YYYY.MM.dd}"
user => "logstash_writer"
password => "${LOGSTASH_PASSWORD}"
ssl => true
cacert => "/usr/share/logstash/config/certs/ca.crt"
}
}
---
# logstash-deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: logstash
spec:
replicas: 3
selector:
matchLabels:
app: logstash
template:
metadata:
labels:
app: logstash
spec:
containers:
- name: logstash
image: docker.elastic.co/logstash/logstash:8.9.0
env:
- name: LOGSTASH_PASSWORD
valueFrom:
secretKeyRef:
name: elastic-credentials
key: logstash-password
ports:
- containerPort: 5044
name: beats
- containerPort: 5045
name: beats-infra
- containerPort: 9600
name: http
volumeMounts:
- name: config
mountPath: /usr/share/logstash/config
- name: pipeline
mountPath: /usr/share/logstash/pipeline
- name: certs
mountPath: /usr/share/logstash/config/certs
readOnly: true
resources:
requests:
memory: 2Gi
cpu: 1
limits:
memory: 4Gi
cpu: 2
volumes:
- name: config
configMap:
name: logstash-config
items:
- key: logstash.yml
path: logstash.yml
- key: pipelines.yml
path: pipelines.yml
- name: pipeline
configMap:
name: logstash-config
items:
- key: application.conf
path: application.conf
- key: infrastructure.conf
path: infrastructure.conf
- name: certs
secret:
          secretName: elastic-certificates
```
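As a concrete reference, here is a log line the `performance` grok pattern above accepts, together with the fields it extracts (values are made up):

```text
2024-05-14 10:32:07 response_time=382.5 status=200 method=GET path=/api/users
# -> response_time: 382.5 (float), status_code: 200 (int),
#    http_method: "GET", request_path: "/api/users"
# -> performance_category: "normal" (100 <= 382.5 < 500)
```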
### Filebeat Configuration

```yaml
# filebeat-config.yaml
apiVersion: v1
kind: ConfigMap
metadata:
name: filebeat-config
data:
filebeat.yml: |
filebeat.inputs:
    # Application log collection
- type: log
enabled: true
paths:
- /var/log/applications/*.log
- /var/log/applications/*/*.log
fields:
log_type: application
environment: ${ENVIRONMENT:production}
fields_under_root: true
multiline.pattern: '^\d{4}-\d{2}-\d{2}'
multiline.negate: true
multiline.match: after
exclude_lines: ['^DEBUG']
    # Kubernetes container logs
- type: container
enabled: true
paths:
- /var/log/containers/*.log
processors:
- add_kubernetes_metadata:
host: ${NODE_NAME}
matchers:
- logs_path:
logs_path: "/var/log/containers/"
- drop_event:
when:
or:
- equals:
kubernetes.container.name: "filebeat"
- equals:
kubernetes.container.name: "logstash"
    # System logs
- type: log
enabled: true
paths:
- /var/log/syslog
- /var/log/auth.log
- /var/log/kern.log
fields:
log_type: syslog
fields_under_root: true
    # Audit logs
- type: log
enabled: true
paths:
- /var/log/audit/audit.log
fields:
log_type: audit
fields_under_root: true
processors:
    # Add host metadata
- add_host_metadata:
when.not.contains.tags: forwarded
    # Add Docker metadata
- add_docker_metadata: ~
    # Rename and clean up fields
- rename:
fields:
- from: "agent"
to: "filebeat_agent"
ignore_missing: true
    # Drop unneeded fields
- drop_fields:
fields: ["ecs", "agent.ephemeral_id", "agent.id", "agent.version"]
output.logstash:
hosts: ["logstash:5044"]
loadbalance: true
worker: 2
    # Monitoring
monitoring.enabled: true
monitoring.elasticsearch:
hosts: ["https://elasticsearch:9200"]
username: "filebeat_system"
password: "${FILEBEAT_PASSWORD}"
ssl.certificate_authority: "/usr/share/filebeat/certs/ca.crt"
    # Logging
logging.level: info
logging.to_files: true
logging.files:
path: /var/log/filebeat
name: filebeat
keepfiles: 7
permissions: 0644
---
# filebeat-daemonset.yaml
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: filebeat
spec:
selector:
matchLabels:
app: filebeat
template:
metadata:
labels:
app: filebeat
spec:
serviceAccountName: filebeat
terminationGracePeriodSeconds: 30
hostNetwork: true
dnsPolicy: ClusterFirstWithHostNet
containers:
- name: filebeat
image: docker.elastic.co/beats/filebeat:8.9.0
args: [
"-c", "/etc/filebeat.yml",
"-e",
]
env:
- name: ENVIRONMENT
value: "production"
- name: NODE_NAME
valueFrom:
fieldRef:
fieldPath: spec.nodeName
- name: FILEBEAT_PASSWORD
valueFrom:
secretKeyRef:
name: elastic-credentials
key: filebeat-password
securityContext:
runAsUser: 0
capabilities:
add:
- DAC_READ_SEARCH
resources:
limits:
memory: 200Mi
cpu: 100m
requests:
cpu: 100m
memory: 100Mi
volumeMounts:
- name: config
mountPath: /etc/filebeat.yml
readOnly: true
subPath: filebeat.yml
- name: data
mountPath: /usr/share/filebeat/data
- name: varlibdockercontainers
mountPath: /var/lib/docker/containers
readOnly: true
- name: varlog
mountPath: /var/log
readOnly: true
- name: certs
mountPath: /usr/share/filebeat/certs
readOnly: true
volumes:
- name: config
configMap:
defaultMode: 0640
name: filebeat-config
- name: varlibdockercontainers
hostPath:
path: /var/lib/docker/containers
- name: varlog
hostPath:
path: /var/log
- name: data
hostPath:
path: /var/lib/filebeat-data
type: DirectoryOrCreate
- name: certs
secret:
          secretName: elastic-certificates
```
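The DaemonSet above references a `filebeat` ServiceAccount that these manifests never define, and the `add_kubernetes_metadata` processor needs read access to the API server. A minimal RBAC sketch:

```yaml
apiVersion: v1
kind: ServiceAccount
metadata:
  name: filebeat
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: filebeat
rules:
  - apiGroups: [""]
    resources: ["pods", "namespaces", "nodes"]
    verbs: ["get", "watch", "list"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: filebeat
subjects:
  - kind: ServiceAccount
    name: filebeat
    namespace: default   # adjust to the namespace Filebeat runs in
roleRef:
  kind: ClusterRole
  name: filebeat
  apiGroup: rbac.authorization.k8s.io
```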
## Distributed Tracing with Jaeger
### Jaeger Cluster Deployment

```yaml
# jaeger-operator.yaml
apiVersion: jaegertracing.io/v1
kind: Jaeger
metadata:
name: jaeger-production
spec:
strategy: production
collector:
replicas: 3
maxReplicas: 10
resources:
requests:
memory: 1Gi
cpu: 500m
limits:
memory: 2Gi
cpu: 1
options:
collector:
num-workers: 100
queue-size: 5000
kafka:
producer:
topic: jaeger-spans
brokers: kafka-cluster:9092
batch-size: 1000
batch-timeout: 1s
query:
replicas: 2
resources:
requests:
memory: 512Mi
cpu: 250m
limits:
memory: 1Gi
cpu: 500m
options:
query:
max-clock-skew-adjustment: 0s
ingester:
replicas: 3
resources:
requests:
memory: 1Gi
cpu: 500m
limits:
memory: 2Gi
cpu: 1
options:
ingester:
deadlockInterval: 1m
kafka:
consumer:
topic: jaeger-spans
brokers: kafka-cluster:9092
group-id: jaeger-ingester
storage:
type: elasticsearch
elasticsearch:
nodeCount: 3
redundancyPolicy: SingleRedundancy
resources:
requests:
memory: 2Gi
cpu: 1
limits:
memory: 4Gi
cpu: 2
storage:
storageClassName: fast-ssd
size: 100Gi
options:
es:
server-urls: https://elasticsearch:9200
username: jaeger
password: ${JAEGER_PASSWORD}
tls:
ca: /etc/ssl/certs/ca.crt
index-prefix: jaeger
create-index-templates: true
version: 8
agent:
strategy: DaemonSet
resources:
requests:
memory: 128Mi
cpu: 100m
limits:
memory: 256Mi
cpu: 200m
options:
agent:
log-level: info
---
# jaeger-sampling-config.yaml
apiVersion: v1
kind: ConfigMap
metadata:
name: jaeger-sampling-config
data:
sampling.json: |
{
"service_strategies": [
{
"service": "frontend",
"type": "probabilistic",
"param": 1.0
},
{
"service": "backend-api",
"type": "probabilistic",
"param": 0.5
},
{
"service": "database-service",
"type": "probabilistic",
"param": 0.1
},
{
"service": "batch-processor",
"type": "probabilistic",
"param": 0.01
}
],
"default_strategy": {
"type": "probabilistic",
"param": 0.1
},
"per_operation_strategies": [
{
"operation": "health-check",
"type": "probabilistic",
"param": 0.0
},
{
"operation": "metrics",
"type": "probabilistic",
"param": 0.0
}
]
    }
```
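How this sampling file reaches the collector depends on how Jaeger is deployed: the operator normally takes strategies inline in the Jaeger CR, while a hand-rolled collector Deployment can mount the ConfigMap and pass the standard flag. An excerpt of the latter approach (the volume name and image tag are mine):

```yaml
# Excerpt from a plain jaeger-collector Deployment (not operator-managed)
containers:
  - name: jaeger-collector
    image: jaegertracing/jaeger-collector:1.48
    args:
      - --sampling.strategies-file=/etc/jaeger/sampling.json
    volumeMounts:
      - name: sampling-config
        mountPath: /etc/jaeger
volumes:
  - name: sampling-config
    configMap:
      name: jaeger-sampling-config
```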
### Application Integration Example

```go
// tracing.go - integrating a Go application with Jaeger
package main
import (
    "context"
    "fmt"
    "io"
    "log"
    "net/http"
    "time"

    "github.com/opentracing/opentracing-go"
    "github.com/opentracing/opentracing-go/ext"
    otlog "github.com/opentracing/opentracing-go/log" // span log fields; aliased to avoid clashing with stdlib log
    "github.com/uber/jaeger-client-go"
    "github.com/uber/jaeger-client-go/config"
    "github.com/uber/jaeger-client-go/rpcmetrics"
    "github.com/uber/jaeger-lib/metrics/prometheus"
)
// Initialize the Jaeger tracer
func initJaeger(service string) (opentracing.Tracer, io.Closer) {
cfg := config.Configuration{
ServiceName: service,
Sampler: &config.SamplerConfig{
Type: jaeger.SamplerTypeRemote,
Param: 1,
SamplingServerURL: "http://jaeger-agent:5778/sampling",
},
Reporter: &config.ReporterConfig{
LogSpans: true,
BufferFlushInterval: 1 * time.Second,
LocalAgentHostPort: "jaeger-agent:6831",
},
}
    // Expose tracer metrics via Prometheus
    metricsFactory := prometheus.New()
    observer := rpcmetrics.NewObserver(metricsFactory, rpcmetrics.DefaultNameNormalizer)
tracer, closer, err := cfg.NewTracer(
config.Logger(jaeger.StdLogger),
config.Metrics(metricsFactory),
        config.Observer(observer),
)
if err != nil {
log.Fatal("Cannot create tracer", err)
}
opentracing.SetGlobalTracer(tracer)
return tracer, closer
}
// HTTP middleware that starts a server-side span per request
func tracingMiddleware(next http.Handler) http.Handler {
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
spanCtx, _ := opentracing.GlobalTracer().Extract(opentracing.HTTPHeaders, opentracing.HTTPHeadersCarrier(r.Header))
span := opentracing.GlobalTracer().StartSpan(r.URL.Path, ext.RPCServerOption(spanCtx))
defer span.Finish()
        // Set standard span tags
ext.HTTPMethod.Set(span, r.Method)
ext.HTTPUrl.Set(span, r.URL.String())
ext.Component.Set(span, "http-server")
        // Attach the span to the request context
ctx := opentracing.ContextWithSpan(r.Context(), span)
r = r.WithContext(ctx)
        // Wrap the ResponseWriter to capture the status code
rw := &responseWriter{ResponseWriter: w, statusCode: http.StatusOK}
next.ServeHTTP(rw, r)
        // Record the response status code
ext.HTTPStatusCode.Set(span, uint16(rw.statusCode))
if rw.statusCode >= 400 {
ext.Error.Set(span, true)
}
})
}
type responseWriter struct {
http.ResponseWriter
statusCode int
}
func (rw *responseWriter) WriteHeader(code int) {
rw.statusCode = code
rw.ResponseWriter.WriteHeader(code)
}
// Tracing a database operation
func queryDatabase(ctx context.Context, query string) error {
span, ctx := opentracing.StartSpanFromContext(ctx, "database.query")
defer span.Finish()
    // Set database-related tags
ext.DBType.Set(span, "postgresql")
ext.DBStatement.Set(span, query)
span.SetTag("db.instance", "production")
    // Simulate a database query
time.Sleep(50 * time.Millisecond)
    // On error, mark the span
if err := simulateDBError(); err != nil {
ext.Error.Set(span, true)
span.LogFields(
log.String("event", "error"),
log.String("message", err.Error()),
)
return err
}
return nil
}
// Tracing an outbound API call
func callExternalAPI(ctx context.Context, url string) (*http.Response, error) {
span, ctx := opentracing.StartSpanFromContext(ctx, "http.client")
defer span.Finish()
req, _ := http.NewRequestWithContext(ctx, "GET", url, nil)
    // Inject trace headers into the outbound request
opentracing.GlobalTracer().Inject(
span.Context(),
opentracing.HTTPHeaders,
opentracing.HTTPHeadersCarrier(req.Header),
)
    // Set standard client tags
ext.HTTPMethod.Set(span, "GET")
ext.HTTPUrl.Set(span, url)
ext.Component.Set(span, "http-client")
client := &http.Client{Timeout: 30 * time.Second}
resp, err := client.Do(req)
if err != nil {
ext.Error.Set(span, true)
span.LogFields(
log.String("event", "error"),
log.String("message", err.Error()),
)
return nil, err
}
ext.HTTPStatusCode.Set(span, uint16(resp.StatusCode))
return resp, nil
}
// Tracing an asynchronous task
func processAsyncTask(ctx context.Context, taskID string) {
span, ctx := opentracing.StartSpanFromContext(ctx, "async.task")
defer span.Finish()
span.SetTag("task.id", taskID)
span.SetTag("task.type", "data-processing")
    // Simulated processing steps, traced as sibling child spans
steps := []string{"validate", "transform", "save", "notify"}
for i, step := range steps {
        childSpan, _ := opentracing.StartSpanFromContext(ctx, fmt.Sprintf("task.%s", step))
childSpan.SetTag("step.number", i+1)
        // Simulate work for each step
time.Sleep(time.Duration(100+i*50) * time.Millisecond)
childSpan.Finish()
}
}
func simulateDBError() error {
    // Simulate an intermittent database error
if time.Now().Unix()%10 == 0 {
return fmt.Errorf("database connection timeout")
}
return nil
}
func main() {
    _, closer := initJaeger("my-service") // the tracer is registered globally inside initJaeger
defer closer.Close()
http.Handle("/api/users", tracingMiddleware(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
ctx := r.Context()
        // Database query
if err := queryDatabase(ctx, "SELECT * FROM users"); err != nil {
http.Error(w, err.Error(), http.StatusInternalServerError)
return
}
        // Outbound API call
if _, err := callExternalAPI(ctx, "https://api.external.com/data"); err != nil {
log.Printf("External API call failed: %v", err)
}
        // Kick off an async task (note: the request context is cancelled when the handler returns)
go processAsyncTask(ctx, "task-123")
w.WriteHeader(http.StatusOK)
w.Write([]byte("OK"))
})))
log.Println("Server starting on :8080")
log.Fatal(http.ListenAndServe(":8080", nil))
}
```

## Grafana Unified Visualization
### Grafana Configuration

```yaml
# grafana-config.yaml
apiVersion: v1
kind: ConfigMap
metadata:
name: grafana-config
data:
grafana.ini: |
[server]
http_port = 3000
domain = grafana.company.com
root_url = https://grafana.company.com
[database]
type = postgres
host = postgres:5432
name = grafana
user = grafana
password = ${GRAFANA_DB_PASSWORD}
ssl_mode = require
[security]
admin_user = admin
admin_password = ${GRAFANA_ADMIN_PASSWORD}
secret_key = ${GRAFANA_SECRET_KEY}
disable_gravatar = true
cookie_secure = true
cookie_samesite = strict
[auth]
disable_login_form = false
disable_signout_menu = false
[auth.ldap]
enabled = true
config_file = /etc/grafana/ldap.toml
allow_sign_up = true
[smtp]
enabled = true
host = smtp.company.com:587
user = grafana@company.com
password = ${SMTP_PASSWORD}
from_address = grafana@company.com
from_name = Grafana
[alerting]
enabled = true
execute_alerts = true
[unified_alerting]
enabled = true
[log]
mode = console file
level = info
[metrics]
enabled = true
interval_seconds = 10
[tracing.jaeger]
address = jaeger-query:16686
always_included_tag = true
sampler_type = const
sampler_param = 1
ldap.toml: |
[[servers]]
host = "ldap.company.com"
port = 636
use_ssl = true
start_tls = false
ssl_skip_verify = false
bind_dn = "cn=grafana,ou=services,dc=company,dc=com"
bind_password = "${LDAP_BIND_PASSWORD}"
search_filter = "(uid=%s)"
search_base_dns = ["ou=users,dc=company,dc=com"]
[servers.attributes]
name = "givenName"
surname = "sn"
username = "uid"
member_of = "memberOf"
email = "mail"
[[servers.group_mappings]]
group_dn = "cn=grafana-admins,ou=groups,dc=company,dc=com"
org_role = "Admin"
[[servers.group_mappings]]
group_dn = "cn=grafana-editors,ou=groups,dc=company,dc=com"
org_role = "Editor"
[[servers.group_mappings]]
group_dn = "cn=grafana-viewers,ou=groups,dc=company,dc=com"
org_role = "Viewer"
datasources.yaml: |
apiVersion: 1
datasources:
      # Prometheus data source
- name: Prometheus
type: prometheus
access: proxy
url: http://thanos-query:9090
isDefault: true
editable: false
jsonData:
timeInterval: 15s
queryTimeout: 60s
httpMethod: POST
exemplarTraceIdDestinations:
- name: trace_id
datasourceUid: jaeger
      # Jaeger data source
- name: Jaeger
type: jaeger
access: proxy
url: http://jaeger-query:16686
uid: jaeger
editable: false
jsonData:
tracesToLogs:
datasourceUid: loki
tags: ['job', 'instance', 'pod', 'namespace']
mappedTags: [
{ key: 'service.name', value: 'service' },
{ key: 'service.namespace', value: 'namespace' }
]
mapTagNamesEnabled: true
spanStartTimeShift: '-1h'
spanEndTimeShift: '1h'
filterByTraceID: true
filterBySpanID: true
      # Elasticsearch data source
- name: Elasticsearch
type: elasticsearch
access: proxy
url: https://elasticsearch:9200
database: "application-logs-*"
basicAuth: true
basicAuthUser: grafana
secureJsonData:
basicAuthPassword: ${ELASTICSEARCH_PASSWORD}
jsonData:
interval: Daily
timeField: "@timestamp"
esVersion: "8.0.0"
includeFrozen: false
logMessageField: message
logLevelField: level
maxConcurrentShardRequests: 5
      # Loki data source
- name: Loki
type: loki
access: proxy
url: http://loki:3100
uid: loki
editable: false
jsonData:
derivedFields:
- datasourceUid: jaeger
matcherRegex: "trace_id=(\\w+)"
name: TraceID
url: "$${__value.raw}"
dashboards.yaml: |
apiVersion: 1
providers:
- name: 'default'
orgId: 1
folder: ''
type: file
disableDeletion: false
updateIntervalSeconds: 10
allowUiUpdates: true
options:
path: /var/lib/grafana/dashboards
- name: 'infrastructure'
orgId: 1
folder: 'Infrastructure'
type: file
disableDeletion: false
updateIntervalSeconds: 10
allowUiUpdates: true
options:
path: /var/lib/grafana/dashboards/infrastructure
- name: 'applications'
orgId: 1
folder: 'Applications'
type: file
disableDeletion: false
updateIntervalSeconds: 10
allowUiUpdates: true
options:
path: /var/lib/grafana/dashboards/applications
---
# grafana-deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: grafana
spec:
replicas: 2
selector:
matchLabels:
app: grafana
template:
metadata:
labels:
app: grafana
spec:
securityContext:
fsGroup: 472
runAsUser: 472
runAsNonRoot: true
containers:
- name: grafana
image: grafana/grafana:10.1.0
env:
- name: GRAFANA_DB_PASSWORD
valueFrom:
secretKeyRef:
name: grafana-secrets
key: db-password
- name: GRAFANA_ADMIN_PASSWORD
valueFrom:
secretKeyRef:
name: grafana-secrets
key: admin-password
- name: GRAFANA_SECRET_KEY
valueFrom:
secretKeyRef:
name: grafana-secrets
key: secret-key
- name: SMTP_PASSWORD
valueFrom:
secretKeyRef:
name: grafana-secrets
key: smtp-password
- name: LDAP_BIND_PASSWORD
valueFrom:
secretKeyRef:
name: grafana-secrets
key: ldap-bind-password
- name: ELASTICSEARCH_PASSWORD
valueFrom:
secretKeyRef:
name: grafana-secrets
key: elasticsearch-password
ports:
- containerPort: 3000
name: http
volumeMounts:
- name: config
mountPath: /etc/grafana
- name: storage
mountPath: /var/lib/grafana
- name: dashboards
mountPath: /var/lib/grafana/dashboards
resources:
requests:
memory: 512Mi
cpu: 250m
limits:
memory: 1Gi
cpu: 500m
livenessProbe:
httpGet:
path: /api/health
port: 3000
initialDelaySeconds: 30
periodSeconds: 10
readinessProbe:
httpGet:
path: /api/health
port: 3000
initialDelaySeconds: 5
periodSeconds: 5
      volumes:
      - name: config
        configMap:
          name: grafana-config
      - name: storage
        persistentVolumeClaim:
          claimName: grafana-storage
      - name: dashboards
        configMap:
          name: grafana-dashboards
```

### Core Dashboard Configuration

```json
{
  "dashboard": {
    "id": null,
    "title": "Observability Overview",
    "tags": ["observability", "overview"],
    "timezone": "browser",
    "panels": [
      {
        "id": 1,
        "title": "System Health",
        "type": "stat",
        "targets": [
          {
            "expr": "up",
            "legendFormat": "{{instance}}"
          }
        ],
        "fieldConfig": {
          "defaults": {
            "color": {
              "mode": "thresholds"
            },
            "thresholds": {
              "steps": [
                {"color": "red", "value": 0},
                {"color": "green", "value": 1}
              ]
            }
          }
        }
      },
      {
        "id": 2,
        "title": "Request Response Time",
        "type": "timeseries",
        "targets": [
          {
            "expr": "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le, service))",
            "legendFormat": "95th percentile - {{service}}"
          }
        ]
      },
      {
        "id": 3,
        "title": "Error Rate Trend",
        "type": "timeseries",
        "targets": [
          {
            "expr": "sum(rate(http_requests_total{status=~\"5..\"}[5m])) by (service) / sum(rate(http_requests_total[5m])) by (service)",
            "legendFormat": "Error Rate - {{service}}"
          }
        ]
      }
    ],
    "time": {
      "from": "now-1h",
      "to": "now"
    },
    "refresh": "30s"
  }
}
```

## Best Practices and Optimization

### 1. Performance Optimization
- Data retention: Prometheus keeps only two hours locally; Thanos handles long-term storage
- Sampling: tune Jaeger sampling rates to each service's importance
- Index optimization: shard Elasticsearch indices by date and clean up old data regularly
- Query optimization: follow PromQL best practices and avoid high-cardinality labels

### 2. Security Hardening
- Encryption in transit: TLS for all inter-component communication
- Authentication: integrate LDAP/AD for unified identity
- Access control: fine-grained RBAC permissions
- Data masking: automatically redact sensitive information in logs

### 3. High-Availability Design
- Multi-replica deployment: run critical components with multiple replicas
- Cross-zone backup: back up data across availability zones
- Failover: automatic failure detection and switchover
- Capacity planning: forecast capacity from historical data

### 4. Operations Automation
- Autoscaling: adjust resources automatically based on load
- Smart alerting: machine-learning-based anomaly detection
- Self-healing: automatic remediation of common failures
- Change management: manage configuration changes through GitOps

## Summary

Building an enterprise observability platform is a systems-engineering effort that spans technology selection, architecture design, performance tuning, and security hardening. The Prometheus + Grafana + Jaeger + ELK stack described here yields a platform that is functionally complete, performant, and reliable, and provides strong technical support for an organization's digital transformation.

In practice, a gradual rollout works best: start with core metrics monitoring, then extend to log aggregation and distributed tracing until a complete observability picture is in place. Just as important are team training and process building, which ensure the platform actually delivers value and improves system reliability and operational efficiency.