-
Notifications
You must be signed in to change notification settings - Fork 1
/
node-alerts.yaml
284 lines (284 loc) · 13.7 KB
/
node-alerts.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
# PrometheusRule manifest declaring a single rule group, `node-alert`,
# whose rules alert on node-level metrics.
kind: PrometheusRule
apiVersion: monitoring.coreos.com/v1
metadata:
# NOTE(review): no `name:` is visible under metadata, and `prometheus: dx`
# looks like a selector label — if so it would normally live under
# `metadata.labels`. Confirm against the consumer of this file.
prometheus: dx
spec:
groups:
- name: node-alert
# NOTE(review): the group is evaluated every 10m, yet the rules below use
# `for:` holds of 1m-10m. A pending alert can only fire on a later
# evaluation, so any `for:` shorter than 10m presumably behaves like ~10m.
# Confirm the interval is intentional.
interval: 10m
rules:
# Disk read latency: read-time rate divided by completed-reads rate (1m
# windows), i.e. average seconds per read, above 0.2 for 5m.
# NOTE(review): label regexes are fully anchored, so `d.*` only selects
# device names that START with "d" — confirm sda/vda/nvme are meant to be
# left out.
- alert: '[P2]-Node 硬盘读取延迟大'
  expr: rate(node_disk_read_time_seconds_total{device=~"d.*"}[1m])/rate(node_disk_reads_completed_total{device=~"d.*"}[1m]) > 0.2
  for: 5m
  labels:
    severity: warn
  annotations:
    description: 实例为 {{ $labels.instance }} 的服务器,硬盘读取延迟大 > 0.2s
    level: P2
    ruleGroupName: node-alert
    ruleName: '[P2]-Node 硬盘读取延迟大'
    type: node
# Clock skew: absolute difference between the evaluation time and the
# node-reported time exceeds 100 seconds for 3m.
- alert: '[P1]-Node 系统时间与实际存在差异'
  expr: abs(time()-node_time_seconds) > 100
  for: 3m
  labels:
    severity: error
  annotations:
    description: 实例为 {{ $labels.instance }} 的服务器,系统时间与实际存在差异,请检查系统时间
    level: P1
    ruleGroupName: node-alert
    ruleName: '[P1]-Node 系统时间与实际存在差异'
    type: node
# Socket usage: more than 5000 sockets in use for 5m (informational).
- alert: '[P3]-Node Socket连接数高'
  expr: node_sockstat_sockets_used > 5000
  for: 5m
  labels:
    severity: info
  annotations:
    description: 实例为 {{ $labels.instance }} 的服务器,Socket连接数> 5000,实际连接数为{{ $value }},注意业务高峰
    level: P3
    ruleGroupName: node-alert
    ruleName: '[P3]-Node Socket连接数高'
    type: node
# Read IOPS: per-second rate of completed reads (1m window) above 3000
# for 3m. No device filter is applied here.
- alert: '[P2]-Node 硬盘读取IOPS高'
  expr: rate(node_disk_reads_completed_total[1m]) > 3000
  for: 3m
  labels:
    severity: warn
  annotations:
    description: 实例为 {{ $labels.instance }} 的服务器,硬盘读取IOPS>3000,实际IOPS为{{ humanize $value }},注意运行的job或业务高峰导致
    level: P2
    ruleGroupName: node-alert
    ruleName: '[P2]-Node 硬盘读取IOPS高'
    type: node
# Inbound traffic: received-bytes rate (1m window) divided by 1024 twice
# (i.e. MiB/s) above 200 for 10m. Devices matching the listed virtual
# interface prefixes (tap, veth, br, docker, ...) are excluded.
- alert: '[P2]-Node 流入流量高'
  expr: rate(node_network_receive_bytes_total{device!~'tap.*|veth.*|br.*|docker.*|vir.*|lo.*|vnet.*|cni.*|kube.*|dummy.*'}[1m]) / 1024 / 1024 > 200
  for: 10m
  labels:
    severity: warn
  annotations:
    description: 实例为 {{ $labels.instance }} 的服务器,流入流量>200M,实际流量为{{ humanize $value }},注意运行的job或业务高峰导致
    level: P2
    ruleGroupName: node-alert
    ruleName: '[P2]-Node 流入流量高'
    type: node
# File descriptors (critical tier): allocated / maximum, as a percentage,
# above 95 for 5m.
- alert: '[P1]-Node 文件描述符使用率高'
  expr: node_filefd_allocated / node_filefd_maximum * 100 > 95
  for: 5m
  labels:
    severity: error
  annotations:
    description: 实例为 {{ $labels.instance }} 的服务器,文件描述符使用率高> 95%,实际使用率为{{ humanize $value }}%,检查应用状态,检查系统限制值
    level: P1
    ruleGroupName: node-alert
    ruleName: '[P1]-Node 文件描述符使用率高'
    type: node
# Disk read throughput: read-bytes rate (1m window) divided by 1024 twice
# (i.e. MiB/s) above 200 for 10m (informational). No device filter.
- alert: '[P3]-Node 硬盘读取数据量高'
  expr: rate(node_disk_read_bytes_total[1m]) / 1024 / 1024 > 200
  for: 10m
  labels:
    severity: info
  annotations:
    description: 实例为 {{ $labels.instance }} 的服务器,硬盘读取数据量>200M,实际数据量为{{ humanize $value }},注意运行的job或业务高峰导致
    level: P3
    ruleGroupName: node-alert
    ruleName: '[P3]-Node 硬盘读取数据量高'
    type: node
# Write IOPS: per-second rate of completed writes (1m window) above 3000
# for 3m. No device filter is applied here.
- alert: '[P2]-Node 硬盘写入IOPS高'
  expr: rate(node_disk_writes_completed_total[1m]) > 3000
  for: 3m
  labels:
    severity: warn
  annotations:
    description: 实例为 {{ $labels.instance }} 的服务器,硬盘写入IOPS高>3000,实际IOPS为{{ humanize $value }},注意运行的job或业务高峰导致
    level: P2
    ruleGroupName: node-alert
    ruleName: '[P2]-Node 硬盘写入IOPS高'
    type: node
# Disk busy ratio: instantaneous rate of IO time, times 100 (percent of
# wall-clock time spent doing IO), above 60 for 5m.
# NOTE(review): irate() needs at least two samples inside the 1m window;
# confirm the scrape interval is short enough for that.
- alert: '[P2]-Node 磁盘IO耗时占比高'
  expr: irate(node_disk_io_time_seconds_total[1m]) * 100 > 60
  for: 5m
  labels:
    severity: warn
  annotations:
    description: 实例为 {{ $labels.instance }} 的服务器,系统磁盘IO耗时占比>60%,实际占比为{{ humanize $value }}%,检查应用状态
    level: P2
    ruleGroupName: node-alert
    ruleName: '[P2]-Node 磁盘IO耗时占比高'
    type: node
# Reboot detection: uptime (evaluation time minus boot time) is below the
# window.
# The enclosing group is evaluated every 10m. The previous form
# (`< 100` with `for: 1m`) needed the condition to hold on two consecutive
# evaluations, which is impossible for a 100s window at a 10m interval, so
# the alert could never leave the pending state. The window is now wider
# than the evaluation interval (900s > 600s) and there is no hold, so at
# least one evaluation after a reboot fires the alert.
- alert: '[P0]-Node 服务器已重启'
  expr: time() - node_boot_time_seconds < 900
  for: 0m
  labels:
    severity: error
  annotations:
    description: 实例为 {{ $labels.instance }} 的服务器,Uptime<900秒,服务器已重启
    level: P0
    ruleGroupName: node-alert
    ruleName: '[P0]-Node 服务器已重启'
    type: node
# CPU usage (critical tier): 100 minus the idle-mode CPU rate (1m window)
# averaged per (instance, namespace) and scaled to percent, above 95
# for 5m.
- alert: '[P1]-Node CPU使用率高'
  expr: (100-(avg(rate(node_cpu_seconds_total{mode="idle"}[1m])) by (instance,namespace))*100) > 95
  for: 5m
  labels:
    severity: error
  annotations:
    description: 实例为 {{ $labels.instance }} 的服务器,CPU使用率>95%,实际使用率为{{ humanize $value }}%,找到对应进程,检查业务状态,留意业务高峰
    level: P1
    ruleGroupName: node-alert
    ruleName: '[P1]-Node CPU使用率高'
    type: node
# Memory usage (warning tier): MemTotal minus Buffers, Cached, MemFree,
# Slab, PageTables and SwapCached, as a percentage of MemTotal, above 85
# for 5m.
- alert: '[P2]-Node 内存使用率高'
  expr: (node_memory_MemTotal_bytes - node_memory_Buffers_bytes-node_memory_Cached_bytes - node_memory_MemFree_bytes - node_memory_Slab_bytes-node_memory_PageTables_bytes - node_memory_SwapCached_bytes) / node_memory_MemTotal_bytes *100 > 85
  for: 5m
  labels:
    severity: warn
  annotations:
    description: 实例为 {{ $labels.instance }} 的服务器,内存使用率高> 85%,实际使用率为{{ humanize $value }}%,找到对应进程,留意业务高峰
    level: P2
    ruleGroupName: node-alert
    ruleName: '[P2]-Node 内存使用率高'
    type: node
# Filesystem usage (warning tier): 100 minus available/size in percent,
# above 85 for 5m. Series whose device label is exactly `rootfs` are
# excluded.
- alert: '[P2]-Node 硬盘空间使用率高'
  expr: (100-(node_filesystem_avail_bytes{device!~"rootfs"}/node_filesystem_size_bytes{device!~"rootfs"})*100) > 85
  for: 5m
  labels:
    severity: warn
  annotations:
    description: 实例为 {{ $labels.instance }} 的服务器,硬盘空间使用率高> 85%,实际使用率为{{ humanize $value }}%,找到对应占用文件,清理不需要的日志
    level: P2
    ruleGroupName: node-alert
    ruleName: '[P2]-Node 硬盘空间使用率高'
    type: node
# CPU iowait: iowait-mode CPU rate (1m window) averaged per
# (instance, namespace) and scaled to percent, above 20 for 5m.
- alert: '[P2]-Node CPU IO等待时间占比高'
  expr: (avg by(instance,namespace)(rate(node_cpu_seconds_total{mode="iowait"}[1m]))*100) > 20
  for: 5m
  labels:
    severity: warn
  annotations:
    description: 实例为 {{ $labels.instance }} 的服务器,CPU IO等待时间占比>20%,实际占比为{{ humanize $value }}%,检查服务器IO状态
    level: P2
    ruleGroupName: node-alert
    ruleName: '[P2]-Node CPU IO等待时间占比高'
    type: node
# Memory usage (critical tier): MemTotal minus Buffers, Cached, MemFree,
# Slab, PageTables and SwapCached, as a percentage of MemTotal, above 95
# for 5m.
- alert: '[P1]-Node 内存使用率高'
  expr: (node_memory_MemTotal_bytes - node_memory_Buffers_bytes-node_memory_Cached_bytes - node_memory_MemFree_bytes -node_memory_Slab_bytes-node_memory_PageTables_bytes - node_memory_SwapCached_bytes) / node_memory_MemTotal_bytes *100 > 95
  for: 5m
  labels:
    severity: error
  annotations:
    description: 实例为 {{ $labels.instance }} 的服务器,内存使用率高> 95%,实际使用率为{{ humanize $value }}%,找到对应进程,留意业务高峰
    level: P1
    ruleGroupName: node-alert
    ruleName: '[P1]-Node 内存使用率高'
    type: node
# Filesystem usage (critical tier): 100 minus available/size in percent,
# above 95 for 5m. Series whose device label is exactly `rootfs` are
# excluded.
- alert: '[P1]-Node 硬盘空间使用率高'
  expr: (100-(node_filesystem_avail_bytes{device!~"rootfs"}/node_filesystem_size_bytes{device!~"rootfs"})*100) > 95
  for: 5m
  labels:
    severity: error
  annotations:
    description: 实例为 {{ $labels.instance }} 的服务器,硬盘空间使用率高> 95%,实际使用率为{{ humanize $value }}%,找到对应占用文件,清理不需要的日志
    level: P1
    ruleGroupName: node-alert
    ruleName: '[P1]-Node 硬盘空间使用率高'
    type: node
# Disk write latency: write-time rate divided by completed-writes rate
# (1m windows), i.e. average seconds per write, above 0.2 for 5m.
# NOTE(review): label regexes are fully anchored, so `d.*` only selects
# device names that START with "d" — confirm sda/vda/nvme are meant to be
# left out.
- alert: '[P2]-Node 硬盘写入延迟大'
  expr: rate(node_disk_write_time_seconds_total{device=~"d.*"}[1m]) / rate(node_disk_writes_completed_total{device=~"d.*"}[1m]) > 0.2
  for: 5m
  labels:
    severity: warn
  annotations:
    description: 实例为 {{ $labels.instance }} 的服务器,硬盘写入延迟大 > 0.2s,检查应用状态
    level: P2
    ruleGroupName: node-alert
    ruleName: '[P2]-Node 硬盘写入延迟大'
    type: node
# TIME_WAIT sockets: more than 5000 TCP sockets in TIME_WAIT for 5m
# (informational).
- alert: '[P3]-Node TIME_WAIT连接数高'
  expr: node_sockstat_TCP_tw > 5000
  for: 5m
  labels:
    severity: info
  annotations:
    description: 实例为 {{ $labels.instance }} 的服务器,TIME_WAIT连接数> 5000,实际连接数为{{ $value }},注意业务高峰
    level: P3
    ruleGroupName: node-alert
    ruleName: '[P3]-Node TIME_WAIT连接数高'
    type: node
# File descriptors (warning tier): allocated / maximum, as a percentage,
# above 80 for 5m.
- alert: '[P2]-Node 文件描述符使用率高'
  expr: node_filefd_allocated / node_filefd_maximum * 100 > 80
  for: 5m
  labels:
    severity: warn
  annotations:
    description: 实例为 {{ $labels.instance }} 的服务器,文件描述符使用率高> 80%,实际使用率为{{ humanize $value }}%,检查应用状态,检查系统限制值
    level: P2
    ruleGroupName: node-alert
    ruleName: '[P2]-Node 文件描述符使用率高'
    type: node
# CPU usage (warning tier): 100 minus the idle-mode CPU rate (1m window)
# averaged per (instance, namespace) and scaled to percent, above 85
# for 5m.
- alert: '[P2]-Node CPU使用率高'
  expr: (100-(avg(rate(node_cpu_seconds_total{mode="idle"}[1m])) by (instance,namespace))*100) > 85
  for: 5m
  labels:
    severity: warn
  annotations:
    description: 实例为 {{ $labels.instance }} 的服务器,CPU使用率>85%,实际使用率为{{ humanize $value }}%,找到对应进程,检查业务状态,留意业务高峰
    level: P2
    ruleGroupName: node-alert
    ruleName: '[P2]-Node CPU使用率高'
    type: node
# Inode usage (warning tier): 100 minus free/total inodes in percent,
# above 85 for 5m, on every filesystem whose device label is not exactly
# `rootfs`.
- alert: '[P2]-Node 硬盘inodes使用率高'
  expr: (100-(node_filesystem_files_free{device!~"rootfs"}/node_filesystem_files{device!~"rootfs"}*100)) > 85
  for: 5m
  labels:
    severity: warn
  annotations:
    description: 实例为 {{ $labels.instance }} 的服务器,硬盘inodes使用率> 85%,实际使用率为{{ humanize $value }}%,找到对应占用文件,清理不需要的日志
    level: P2
    ruleGroupName: node-alert
    ruleName: '[P2]-Node 硬盘inodes使用率高'
    type: node
# Inode usage (critical tier): 100 minus free/total inodes in percent,
# above 95 for 5m, restricted to the `/`, `/app` and `/home` mountpoints.
# The third alternative used to be `home` without the leading slash;
# label regexes are fully anchored and mountpoints are absolute paths, so
# `/home` was never selected. Both selectors now use `/home`.
- alert: '[P1]-Node 硬盘inodes使用率高'
  expr: (100-(node_filesystem_files_free{device!~"rootfs",mountpoint=~"/|/app|/home"}/node_filesystem_files{device!~"rootfs",mountpoint=~"/|/app|/home"}*100)) > 95
  for: 5m
  labels:
    severity: error
  annotations:
    description: 实例为 {{ $labels.instance }} 的服务器,硬盘inodes使用率> 95%,实际使用率为{{ humanize $value }}%,找到对应占用文件,清理不需要的日志
    level: P1
    ruleGroupName: node-alert
    ruleName: '[P1]-Node 硬盘inodes使用率高'
    type: node
# Disk write throughput: written-bytes rate (1m window) divided by 1024
# twice (i.e. MiB/s) above 200 for 10m (informational).
# NOTE(review): unlike the read-throughput rule, this one filters on
# `device=~"d.*"`, which (being anchored) only selects device names that
# START with "d" — confirm the asymmetry is intended.
- alert: '[P3]-Node 硬盘写入数据量高'
  expr: rate(node_disk_written_bytes_total{device=~"d.*"}[1m])/ 1024 / 1024 > 200
  for: 10m
  labels:
    severity: info
  annotations:
    description: 实例为 {{ $labels.instance }} 的服务器,硬盘写入数据量>200M,实际数据量为{{ humanize $value }},注意运行的job或业务高峰导致
    level: P3
    ruleGroupName: node-alert
    ruleName: '[P3]-Node 硬盘写入数据量高'
    type: node
# Receive error ratio: receive-error rate divided by received-packet rate
# (1m windows), in percent, above 10 for 3m. Virtual interfaces (tap,
# veth, br, docker, ...) are excluded.
# The denominator's device filter used to begin with a truncated `t*`
# where the numerator has `tap.*|veth.*`; the two selectors are now
# identical so both sides of the division describe the same device set.
- alert: '[P1]-Node 数据包接收错误率高'
  expr: (rate(node_network_receive_errs_total{device!~'tap.*|veth.*|br.*|docker.*|vir.*|lo.*|vnet.*|cni.*|kube.*|dummy.*'}[1m]) / rate(node_network_receive_packets_total{device!~'tap.*|veth.*|br.*|docker.*|vir.*|lo.*|vnet.*|cni.*|kube.*|dummy.*'}[1m]))*100 > 10
  for: 3m
  labels:
    severity: error
  annotations:
    description: 实例为 {{ $labels.instance }} 的服务器,数据包接收错误率>10%,实际错误率为{{ humanize $value }}
    level: P1
    ruleGroupName: node-alert
    ruleName: '[P1]-Node 数据包接收错误率高'
    type: node
# Exporter liveness: `up` is 0 for 3m on any target whose instance label
# ends in `9100`.
- alert: '[P1]-Node-Exporter Down'
  expr: up{instance=~".*9100"}==0
  for: 3m
  labels:
    severity: error
  annotations:
    description: 实例为 {{ $labels.instance }} 的服务器中Node_Exporter未运行
    level: P1
    ruleGroupName: node-alert
    ruleName: '[P1]-Node-Exporter Down'
    type: node