Skip to content

Commit 7aee1c0

Browse files
pablomendezroyo, Pablo Mendez, mateumiralles, Marketen
authored
Add notifications (#81)
* Add notifications * update notifications * edit conditions * add temperature and host out of disk space metrics * fix typo * use average of cpu cores only * update definition * add prio * add correlationid and banner * set disk and temperature as banner * consistency with evaluation * use category hardware * use single line * don't use whitespace * update notifications yaml * Update copies * improve cpu query (#84) * fix cpu usage and temp --------- Co-authored-by: Pablo Mendez <pablo@dappnode.io> Co-authored-by: mateumiralles <mateumiralles714@gmail.com> Co-authored-by: Marc Font <36164126+Marketen@users.noreply.github.com> Co-authored-by: Marketen <marcfont12@gmail.com>
1 parent 15e152f commit 7aee1c0

File tree

1 file changed

+104
-0
lines changed

1 file changed

+104
-0
lines changed

notifications.yaml

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
# List of endpoint checks; entries use flush sequence style.
endpoints:
# CPU usage check against the Prometheus HTTP query API. Decoded PromQL:
#   100 * sum by(instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m]))
#     / sum by(instance) (rate(node_cpu_seconds_total[2m]))
- name: 'High CPU Usage Check'
  enabled: true
  group: 'hardware'
  url: 'http://prometheus.dms.dappnode:9090/api/v1/query?query=100%20*%20sum%20by%28instance%29%20%28rate%28node_cpu_seconds_total%7Bmode!%3D%22idle%22%7D%5B2m%5D%29%29%20%2F%20sum%20by%28instance%29%20%28rate%28node_cpu_seconds_total%5B2m%5D%29%29'
  method: 'GET'
  interval: '30s'
  # Passing state: per the definition below, the alert triggers once usage
  # exceeds the limit written here.
  conditions:
  - '[BODY].data.result[0].value[1] <= 80'
  # Bounds and unit for the threshold value.
  # NOTE(review): presumably consumed by the settings UI - confirm.
  metric:
    min: 0
    max: 100
    unit: '%'
  definition:
    title: 'Configure your CPU Usage Alert'
    description: 'Triggers if CPU usage exceeds the limit defined in the condition'
  priority: 'medium'
  correlationId: 'dms-cpu'
  # NOTE(review): quoted, so this is the string "false", not a boolean -
  # confirm the consumer expects a string.
  isBanner: 'false'
  alerts:
  - type: custom
    enabled: true
    # [CONDITION_VALUE] is a placeholder - presumably filled with the limit
    # from the condition above; confirm.
    description: 'CPU % usage above [CONDITION_VALUE]'
    # NOTE(review): these look like Gatus-style consecutive-result counts - confirm.
    failure-threshold: 2
    success-threshold: 1
    send-on-resolved: true
27+
28+
# Memory usage check against the Prometheus HTTP query API. PromQL:
#   100*(1-node_memory_MemAvailable_bytes/node_memory_MemTotal_bytes)
- name: 'Host Memory Check'
  enabled: true
  group: 'hardware'
  url: 'http://prometheus.dms.dappnode:9090/api/v1/query?query=100*(1-node_memory_MemAvailable_bytes/node_memory_MemTotal_bytes)'
  method: 'GET'
  interval: '30s'
  # Passing state: per the definition below, the alert triggers once usage
  # exceeds the limit written here.
  conditions:
  - '[BODY].data.result[0].value[1] <= 90'
  # Bounds and unit for the threshold value.
  # NOTE(review): presumably consumed by the settings UI - confirm.
  metric:
    min: 0
    max: 100
    unit: '%'
  definition:
    title: 'Configure your Memory Usage Alert'
    description: 'Triggers if memory usage exceeds the limit defined in the condition'
  priority: 'medium'
  correlationId: 'dms-memory'
  # NOTE(review): quoted, so this is the string "false", not a boolean -
  # confirm the consumer expects a string.
  isBanner: 'false'
  alerts:
  - type: custom
    enabled: true
    # [CONDITION_VALUE] is a placeholder - presumably filled with the limit
    # from the condition above; confirm.
    description: 'Memory % usage above [CONDITION_VALUE]'
    # NOTE(review): these look like Gatus-style consecutive-result counts - confirm.
    failure-threshold: 2
    success-threshold: 1
    send-on-resolved: true
53+
54+
# Disk usage check against the Prometheus HTTP query API. Decoded PromQL:
#   avg((1-node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"}
#     /node_filesystem_size_bytes)*100)
# NOTE(review): avg() across all matching filesystems can hide a single full
# filesystem behind emptier ones - consider whether max() is the intent.
- name: 'Host Disk Space Check'
  enabled: true
  group: 'hardware'
  # "^" and "|" are not legal unencoded in a URI query (RFC 3986), so they are
  # written as %5E and %7C; the decoded query is unchanged.
  url: 'http://prometheus.dms.dappnode:9090/api/v1/query?query=avg((1-node_filesystem_avail_bytes%7Bfstype!~%22%5E(fuse.*%7Ctmpfs%7Ccifs%7Cnfs)%22%7D/node_filesystem_size_bytes)*100)'
  method: 'GET'
  interval: '30s'
  # Passing state: per the definition below, the alert triggers once usage
  # exceeds the limit written here.
  conditions:
  - '[BODY].data.result[0].value[1] <= 90'
  # Bounds and unit for the threshold value.
  # NOTE(review): presumably consumed by the settings UI - confirm.
  metric:
    min: 0
    max: 100
    unit: '%'
  definition:
    title: 'Configure your Disk Space Alert'
    description: 'Triggers if disk usage exceeds the limit defined in the condition'
  priority: 'high'
  correlationId: 'dms-disk'
  # NOTE(review): quoted, so this is the string "false", not a boolean -
  # confirm the consumer expects a string.
  isBanner: 'false'
  alerts:
  - type: custom
    enabled: true
    # [CONDITION_VALUE] is a placeholder - presumably filled with the limit
    # from the condition above; confirm.
    description: 'Disk % usage above [CONDITION_VALUE]'
    # NOTE(review): these look like Gatus-style consecutive-result counts - confirm.
    failure-threshold: 2
    success-threshold: 1
    send-on-resolved: true
79+
80+
# Temperature check against the Prometheus HTTP query API. Decoded PromQL:
#   avg(node_hwmon_temp_celsius{chip=~".*coretemp.*|.*18_3$|.*k10temp.*"})
# NOTE(review): if no chip label matches, the query result is empty and
# result[0] does not exist - confirm how the condition is evaluated then.
- name: 'Host Temperature Check'
  enabled: true
  group: 'hardware'
  url: 'http://prometheus.dms.dappnode:9090/api/v1/query?query=avg%28node_hwmon_temp_celsius%7Bchip%3D~%22.*coretemp.*%7C.*18_3%24%7C.*k10temp.*%22%7D%29'
  method: 'GET'
  interval: '30s'
  # Passing state: per the definition below, the alert triggers once the
  # average temperature exceeds the threshold written here.
  conditions:
  - '[BODY].data.result[0].value[1] <= 85'
  # Bounds and unit for the threshold value.
  # NOTE(review): presumably consumed by the settings UI - confirm.
  metric:
    min: 25
    max: 100
    unit: '°C'
  definition:
    title: 'Configure your Temperature Alert'
    description: 'Triggers if the average node temperature exceeds the defined threshold'
  priority: 'medium'
  correlationId: 'dms-temperature'
  # NOTE(review): quoted, so this is the string "false", not a boolean -
  # confirm the consumer expects a string.
  isBanner: 'false'
  alerts:
  - type: custom
    enabled: true
    # [CONDITION_VALUE] is a placeholder - presumably filled with the limit
    # from the condition above; confirm.
    description: 'Average node temperature above [CONDITION_VALUE]°C'
    # NOTE(review): these look like Gatus-style consecutive-result counts - confirm.
    failure-threshold: 2
    success-threshold: 1
    send-on-resolved: true

0 commit comments

Comments
 (0)