# blackbox_rules.yml
---
# Alerting rules evaluated over blackbox probe metrics
# (probe_success, probe_ssl_earliest_cert_expiry).
groups:
  - name: blackbox_alert
    rules:
      # Fires when a probed target has reported probe_success == 0
      # continuously for 5 minutes.
      - alert: blackbox_alert
        expr: probe_success == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "接口/主机/端口 {{ $labels.instance }} 无法联通"
          description: "请尽快检测"
      # Fires when the earliest certificate expiry is less than 30 days away.
      # The expression converts the remaining seconds to days (/ 86400), and
      # the condition must hold for 1 hour before the alert fires.
      - alert: "ssl证书过期警告"
        expr: (probe_ssl_earliest_cert_expiry - time())/86400 <30
        for: 1h
        labels:
          # NOTE(review): other rule groups spell this level "warning";
          # left as-is because label values may drive Alertmanager routing.
          severity: warn
        annotations:
          description: '域名{{$labels.instance}}的证书还有{{ printf "%.1f" $value }}天就过期了,请尽快更新证书'
          summary: "ssl证书过期警告"
# k8s_rules.yml
---
# Alerting rules for scrape-target health (`up`) and Kubernetes
# pod/container state (kube_pod_container_status_* metrics).
groups:
  - name: node.rules
    rules:
      # Scrape-target health: `up` is 0 when the scrape fails and 1 when it
      # succeeds. Fires after the target has been unreachable for 5 minutes.
      - alert: JobDown
        expr: up == 0
        for: 5m
        labels:
          severity: error
          cluster: k8s
        annotations:
          summary: "Job: {{ $labels.job }} down"
          description: "Instance:{{ $labels.instance }}, Job {{ $labels.job }} stop "
      # Fires when a container's "running" status metric is anything but 1.
      # NOTE(review): `for: 2s` is presumably shorter than the rule evaluation
      # interval, so the alert fires on the next evaluation - confirm intent.
      - alert: PodDown
        expr: kube_pod_container_status_running != 1
        for: 2s
        labels:
          severity: warning
          cluster: k8s
        annotations:
          summary: 'Container: {{ $labels.container }} down'
          description: 'Namespace: {{ $labels.namespace }}, Pod: {{ $labels.pod }} is not running'
      # Fires when a container has NOT been ready (status != 1) for 5 minutes,
      # which suggests a startup problem. The annotations say "not ready" so
      # they match the condition that actually triggers the alert.
      - alert: PodReady
        expr: kube_pod_container_status_ready != 1
        for: 5m
        labels:
          severity: warning
          cluster: k8s
        annotations:
          summary: 'Container: {{ $labels.container }} not ready'
          description: 'Namespace: {{ $labels.namespace }}, Pod: {{ $labels.pod }} has not been ready for 5 minutes'
      # Fires when the container restart counter changed within the last
      # 30 minutes.
      # NOTE(review): changes() counts value changes between samples, which
      # may be lower than the real number of restarts reported as $value.
      - alert: PodRestart
        expr: changes(kube_pod_container_status_restarts_total[30m])>0
        for: 2s
        labels:
          severity: warning
          cluster: k8s
        annotations:
          summary: 'Container: {{ $labels.container }} restart'
          description: 'namespace: {{ $labels.namespace }}, pod: {{ $labels.pod }} restart {{ $value }} times'
# node_rules.yml
---
# Host-level alerting rules (liveness, CPU, memory, disk, TCP, network,
# load) evaluated over `up` and `node_*` metrics.
groups:
  - name: 主机状态-监控告警
    rules:
      # Target liveness: fires when `up` has been 0 for 60 seconds.
      - alert: 主机存活告警
        expr: up == 0
        for: 60s
        labels:
          severity: warning
        annotations:
          summary: "{{ $labels.instance }} 宕机超过1分钟!"
      # CPU usage: 100 minus the per-instance average idle rate (as a
      # percentage) above 80 for 15 minutes.
      - alert: 主机CPU使用率告警
        expr: 100 - (avg by (instance)(irate(node_cpu_seconds_total{mode="idle"}[1m]) )) * 100 > 80
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: "CPU近15分钟使用率大于80%, 实例: {{ $labels.instance }},当前值:{{ $value }}%"
      # Memory usage: (1 - MemAvailable / MemTotal) * 100 above 85 for
      # 15 minutes.
      - alert: 主机内存使用率告警
        expr: (1 - (node_memory_MemAvailable_bytes / (node_memory_MemTotal_bytes))) * 100 > 85
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: "内存利用率大于85%, 实例: {{ $labels.instance }},当前值:{{ $value }}%"
      # Disk usage above 80% on xfs/ext4 filesystems for 15 minutes.
      - alert: 主机磁盘使用率告警
        expr: 100 - node_filesystem_free_bytes{fstype=~"xfs|ext4"} / node_filesystem_size_bytes{fstype=~"xfs|ext4"} * 100 > 80
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: "磁盘使用率大于80%, 实例: {{ $labels.instance }},当前值:{{ $value }}%"
      # Available space on /data below 2 GiB (bytes / 1073741824) for 1 minute.
      - alert: 数据目录可用量
        expr: node_filesystem_avail_bytes{mountpoint="/data",fstype=~"ext4|xfs"} /1073741824 < 2
        for: 1m
        labels:
          # NOTE(review): this rule sets `status` where the other rules set
          # `severity`; confirm Alertmanager routing expects this label.
          status: 严重
        annotations:
          value: "{{ $value }}"
          instance: "{{ $labels.instance }}"
          mountpoint: "{{$labels.mountpoint}}"
          summary: "实例在data挂载点磁盘可用量小于2G!, 当前可用: {{ $value }}G"
      # Available space on / below 2 GiB (bytes / 1073741824) for 1 minute.
      - alert: 根目录可用量
        expr: node_filesystem_avail_bytes{mountpoint="/",fstype=~"ext4|xfs"} /1073741824 < 2
        for: 1m
        labels:
          # NOTE(review): this rule sets `status` where the other rules set
          # `severity`; confirm Alertmanager routing expects this label.
          status: 严重
        annotations:
          value: "{{ $value }}"
          instance: "{{ $labels.instance }}"
          mountpoint: "{{$labels.mountpoint}}"
          summary: "实例在root挂载点磁盘可用量小于2G!, 当前可用: {{ $value }}G"
      # Established TCP connections above 10000 for 2 minutes. The value is a
      # connection count, so the description reports it as a count rather
      # than as a percentage.
      - alert: TCP连接数
        expr: node_netstat_Tcp_CurrEstab > 10000
        for: 2m
        labels:
          severity: 严重告警
        annotations:
          summary: " TCP_ESTABLISHED过高!"
          description: "{{$labels.instance}} TCP_ESTABLISHED连接数大于10000,当前值{{ $value }}."
      # TIME_WAIT sockets at or above 5000 for 1 minute. The value is a
      # socket count, so no percent sign is appended.
      - alert: 主机Tcp TimeWait数量过多告警
        expr: node_sockstat_TCP_tw >= 5000
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "Tcp TimeWait数量大于5000, 实例: {{ $labels.instance }},当前值:{{ $value }}"
      # Share of CPU time spent in iowait over the last 5 minutes, per
      # instance, at or above 10% for 5 minutes.
      - alert: 主机iowait较高
        expr: (sum(increase(node_cpu_seconds_total{mode='iowait'}[5m]))by(instance)) / (sum(increase(node_cpu_seconds_total[5m]))by(instance)) *100 >= 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "CPU ioWait近5分钟占比大于等于10%, 实例: {{ $labels.instance }},当前值:{{ $value }}%"
      # Average disk I/O time rate per (instance, job), as a percentage,
      # above 90 for 5 minutes.
      - alert: 磁盘IO性能
        expr: avg(irate(node_disk_io_time_seconds_total[1m])) by(instance,job)* 100 > 90
        for: 5m
        labels:
          severity: 严重告警
        annotations:
          summary: "{{$labels.instance}} 流入磁盘IO使用率过高,请尽快处理!"
          description: "{{$labels.instance}} 流入磁盘IO大于90%,当前使用率{{ $value }}%."
      # Disk read throughput above 50 * 1024 * 1024 bytes/s for 5 minutes.
      - alert: 主机磁盘读过大
        expr: sum by (instance) (rate(node_disk_read_bytes_total[2m])) > 50*1024 *1024
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "磁盘读过大, 实例: {{$labels.instance}},当前值: {{ $value | humanize1024 }}。"
      # Disk write throughput above 50 * 1024 * 1024 bytes/s for 5 minutes.
      - alert: 主机磁盘写过大
        expr: sum by (instance) (rate(node_disk_written_bytes_total[2m])) > 50 * 1024 * 1024
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "磁盘写过大, 实例: {{$labels.instance}},当前值: {{ $value | humanize1024 }}。"
      # Inbound network traffic per (instance, job), virtual and loopback
      # devices excluded. Label regexes are fully anchored, so the exclusions
      # are written as `virbr.*` and `lo`; the glob-style `virbr*` would not
      # match a device such as virbr0.
      # NOTE(review): the condition is equivalent to rate > 10240000 bytes/s
      # and $value is rate / 100; confirm this matches the intended "100M".
      - alert: 网络流入
        expr: ((sum(rate (node_network_receive_bytes_total{device!~'tap.*|veth.*|br.*|docker.*|virbr.*|lo'}[5m])) by (instance,job)) / 100) > 102400
        for: 5m
        labels:
          severity: 严重告警
        annotations:
          summary: "{{$labels.instance}} 流入网络带宽过高,请尽快处理!"
          description: "{{$labels.instance}} 流入网络带宽持续5分钟高于100M. RX带宽使用量{{$value}}."
      # Outbound network traffic per (instance, job), with the same device
      # exclusions and threshold as the inbound rule. The description uses a
      # well-formed {{$value}} template and says TX for transmit traffic.
      # NOTE(review): the condition is equivalent to rate > 10240000 bytes/s
      # and $value is rate / 100; confirm this matches the intended "100M".
      - alert: 网络流出
        expr: ((sum(rate (node_network_transmit_bytes_total{device!~'tap.*|veth.*|br.*|docker.*|virbr.*|lo'}[5m])) by (instance,job)) / 100) > 102400
        for: 5m
        labels:
          severity: 严重告警
        annotations:
          summary: "{{$labels.instance}} 流出网络带宽过高,请尽快处理!"
          description: "{{$labels.instance}} 流出网络带宽持续5分钟高于100M. TX带宽使用量{{$value}}."
      # System load above 5.6 for 1 minute. The annotations identify the host
      # through the instance label rather than the alert name.
      # NOTE(review): the alert name says 15 minutes but the expression reads
      # node_load5; confirm which one is intended.
      - alert: 系统15分钟负载告警
        expr: node_load5 > 5.6
        for: 1m
        labels:
          user: prometheus
          severity: warning
        annotations:
          summary: "服务器: {{$labels.instance}} 系统负载报警"
          description: "服务器:{{$labels.instance}},系统负载: 使用超过70%!当前值: {{ $value }}"
          value: "{{ $value }}"