# prometheus.rules.example.conf — example Prometheus alerting rule groups.
groups:
# Group: per-instance CPU usage alerting.
- name: cpuUsageAlert
  rules:
  # Fires when the average non-idle CPU share of an instance stays above 95%
  # for 5 minutes.
  - alert: cpu_usage_alert
    # rate() is used instead of irate(): irate() only looks at the last two
    # samples, so a single brief dip would reset the `for` timer below.
    expr: 100 - ((avg by (instance)(rate(node_cpu_seconds_total{mode="idle"}[5m]))) * 100) > 95
    for: 5m
    labels:
      adanos_level: error
      adanos_tag: cpu_usage_alert
      # adanos_id distinguishes events; it is the key used for event
      # suppression and automatic recovery.
      adanos_id: "cpu_usage_{{ $labels.instance }}"
      # If no new event arrives within adanos_recovery_after, a recovery
      # event is generated automatically.
      adanos_recovery_after: 10m
      # If a new event with the same adanos_id arrives within
      # adanos_repeat_interval, it is dropped; this acts as event suppression.
      adanos_repeat_interval: 15m
      adanos_server: "{{ $labels.instance }}"
      adanos_environment: prod
    annotations:
      summary: "服务器 {{ $labels.instance }},CPU 使用率过高"
      description: "CPU 使用率高于 95% (当前值 : {{ $value }}%)"
# Group: low-memory alerting per instance.
- name: memory_usage_alert
  rules:
  # Fires when free + cached + buffer memory stays below 5% of total memory
  # for 10 minutes.
  # NOTE(review): the alert name looks like a misspelling of "important"; it
  # is kept unchanged because it becomes the alertname label.
  - alert: out_of_memory_imporant
    expr: '(node_memory_MemFree_bytes + node_memory_Cached_bytes + node_memory_Buffers_bytes) / node_memory_MemTotal_bytes * 100 < 5'
    for: 10m
    labels:
      adanos_environment: prod
      adanos_server: '{{ $labels.instance }}'
      adanos_level: error
      adanos_tag: out_of_memory
      # Event key: one event stream per instance.
      adanos_id: 'out_of_memory_{{ $labels.instance }}'
      adanos_repeat_interval: 10m
      adanos_recovery_after: 15m
    annotations:
      description: '剩余内存不足 5% (当前值 : {{ $value }}%)'
      summary: '服务器 {{ $labels.instance }},内存使用率过高'
# Group: filesystem capacity, disk I/O latency and inode exhaustion alerts.
- name: diskAlert
  rules:
  # Fires when a filesystem (excluding device "rootfs" and mountpoint "/mnt")
  # is more than 95% full for 10 minutes.
  - alert: disk_space_alert
    expr: 100 - ((node_filesystem_avail_bytes{device!~'rootfs',mountpoint!~'/mnt'} * 100) / node_filesystem_size_bytes{device!~'rootfs',mountpoint!~'/mnt'}) > 95
    for: 10m
    labels:
      adanos_level: error
      adanos_tag: disk_space_alert
      # Event key: one event stream per instance and mountpoint.
      adanos_id: "disk_space_{{ $labels.instance }}_{{ $labels.mountpoint }}"
      adanos_recovery_after: 15m
      adanos_repeat_interval: 30m
      adanos_server: "{{ $labels.instance }}"
      adanos_environment: prod
    annotations:
      summary: "服务器 {{ $labels.instance }},分区 {{ $labels.mountpoint }} 没有足够的可用空间"
      description: "分区 {{ $labels.mountpoint }} 磁盘空间使用率高于 95% (当前值 : {{ $value }}%)"
  # Fires when the average time per completed read exceeds 100 ms for
  # 10 minutes.
  - alert: disk_read_latency
    # The time counter is in seconds, so seconds-per-read is multiplied by
    # 1000 to compare against (and report) milliseconds.
    expr: rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) * 1000 > 100
    for: 10m
    labels:
      adanos_level: error
      adanos_tag: disk_read_latency
      # node_disk_* series are keyed by `device` (they have no mountpoint
      # label), so the device is what keeps event ids distinct per disk.
      adanos_id: "disk_read_latency_{{ $labels.instance }}_{{ $labels.device }}"
      adanos_recovery_after: 15m
      adanos_repeat_interval: 30m
      adanos_server: "{{ $labels.instance }}"
      adanos_environment: prod
    annotations:
      summary: "服务器 {{ $labels.instance }},磁盘 {{ $labels.device }} 读取延迟过高"
      description: "磁盘 {{ $labels.device }} 读取延迟大于 100ms (当前值 : {{ $value }} ms)"
  # Fires when the average time per completed write exceeds 100 ms for
  # 10 minutes.
  - alert: disk_write_latency
    # Same seconds-to-milliseconds scaling as the read-latency rule.
    expr: rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) * 1000 > 100
    for: 10m
    labels:
      adanos_level: error
      adanos_tag: disk_write_latency
      # Keyed by device for the same reason as the read-latency rule.
      adanos_id: "disk_write_latency_{{ $labels.instance }}_{{ $labels.device }}"
      adanos_recovery_after: 15m
      adanos_repeat_interval: 30m
      adanos_server: "{{ $labels.instance }}"
      adanos_environment: prod
    annotations:
      summary: "服务器 {{ $labels.instance }},磁盘 {{ $labels.device }} 写入延迟过高"
      description: "磁盘 {{ $labels.device }} 磁盘写入延迟大于 100ms (当前值 : {{ $value }} ms)"
  # Fires when fewer than 5% of a filesystem's inodes are free for 10 minutes
  # (mountpoint "/mnt" excluded).
  - alert: disk_outof_inodes
    expr: node_filesystem_files_free{mountpoint!~'/mnt'} / node_filesystem_files{mountpoint!~'/mnt'} * 100 < 5
    for: 10m
    labels:
      adanos_level: error
      adanos_tag: disk_outof_inodes
      adanos_id: "disk_outof_inodes_{{ $labels.instance }}_{{ $labels.mountpoint }}"
      adanos_recovery_after: 15m
      adanos_repeat_interval: 30m
      adanos_server: "{{ $labels.instance }}"
      adanos_environment: prod
    annotations:
      summary: "服务器 {{ $labels.instance }} ({{ $labels.mountpoint }}) inode 节点数量不足"
      description: "inode 节点剩余不足 5% (当前值 : {{ $value }} %)"
# Group: MySQL connection-pool saturation alerting.
- name: databaseUsageAlert
  rules:
  # Fires when connected threads exceed 80% of max_connections for 3 minutes.
  # The 1-minute peak is preferred; the instantaneous ratio is the fallback
  # when the max_over_time() side yields no sample.
  - alert: database_connection_high
    # `name` and `server_ip` are kept in the aggregation because the labels
    # and annotations below template them; aggregating by `instance` alone
    # drops them and collapses every database onto one adanos_id.
    # NOTE(review): assumes `name` and `server_ip` are labels attached to the
    # MySQL exporter series — confirm against the scrape configuration.
    expr: >-
      max by (instance, name, server_ip) (
      (max_over_time(mysql_global_status_threads_connected[1m]) / mysql_global_variables_max_connections)
      or
      (mysql_global_status_threads_connected / mysql_global_variables_max_connections)
      ) * 100 > 80
    for: 3m
    labels:
      adanos_level: warning
      adanos_tag: database_connection_high
      # Event key: one event stream per database name.
      adanos_id: "database_connection_high_{{ $labels.name }}"
      adanos_recovery_after: 5m
      adanos_repeat_interval: 3m
      adanos_server: "{{ $labels.name }}"
      adanos_instance: "{{ $labels.instance }}"
      adanos_server_ip: "{{ $labels.server_ip }}"
      adanos_environment: prod
    annotations:
      summary: "数据库 {{ $labels.name }}, 连接数占用率较高,敬请留意"
      description: "数据库连接数占用率高于 80% (当前值 : {{ $value }}%)"
# Group: RabbitMQ queue backlog alerting.
- name: RabbitMQAlert
  rules:
  # Fires when the total message count of a queue stays above 2000 for
  # 15 minutes.
  - alert: queue_message_alert
    expr: 'sum(rabbitmq_queue_messages) by (queue) > 2000'
    for: 15m
    labels:
      adanos_environment: prod
      # NOTE(review): the expression aggregates by `queue` only, so the
      # `instance` label is presumably empty here — confirm.
      adanos_server: '{{ $labels.instance }}'
      adanos_level: error
      adanos_tag: rabbitmq_queue_message_alert
      # Event key: one event stream per queue.
      adanos_id: 'rabbitmq_queue_message_{{ $labels.queue }}'
      adanos_repeat_interval: 10m
      adanos_recovery_after: 15m
    annotations:
      description: '积压消息数量超过 2000 条 (当前值 : {{ $value }})'
      summary: 'RabbitMQ 队列 {{ $labels.queue }} 积压消息过多'
  # Fires when any queue whose name ends in "@failed" holds messages for
  # 15 minutes.
  - alert: queue_failed_alert
    expr: 'sum(rabbitmq_queue_messages{queue=~".*@failed"}) by (queue) > 0'
    for: 15m
    labels:
      adanos_environment: prod
      # NOTE(review): as above, `instance` is presumably empty after the
      # by-queue aggregation — confirm.
      adanos_server: '{{ $labels.instance }}'
      adanos_level: error
      adanos_tag: rabbitmq_failed_message_alert
      adanos_id: 'rabbitmq_failed_message_{{ $labels.queue }}'
      adanos_repeat_interval: 1h
      adanos_recovery_after: 20m
    annotations:
      description: '有 {{ $value }} 条积压错误消息待处理'
      summary: 'RabbitMQ 失败任务队列 {{ $labels.queue }} 有积压消息'