|
| 1 | +endpoints: |
# Hardware check: CPU utilisation per instance, read from Prometheus.
# NOTE(review): nesting was reconstructed from a flattened diff view —
# confirm against the consumer's schema.
- name: 'High CPU Usage Check'
  enabled: true
  group: 'hardware'
  # Percent-encoded PromQL. Decoded:
  #   100 * sum by(instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m]))
  #     / sum by(instance) (rate(node_cpu_seconds_total[2m]))
  url: 'http://prometheus.dms.dappnode:9090/api/v1/query?query=100%20*%20sum%20by%28instance%29%20%28rate%28node_cpu_seconds_total%7Bmode!%3D%22idle%22%7D%5B2m%5D%29%29%20%2F%20sum%20by%28instance%29%20%28rate%28node_cpu_seconds_total%5B2m%5D%29%29'
  method: 'GET'
  interval: '30s'
  conditions:
  # Passes while the first result's value is at most 80.
  - '[BODY].data.result[0].value[1] <= 80'
  # Range and unit of the checked value (presumably used when presenting the
  # threshold to the user — confirm).
  metric:
    min: 0
    max: 100
    unit: '%'
  definition:
    title: 'Configure your CPU Usage Alert'
    description: 'Triggers if CPU usage exceeds the limit defined in the condition'
  priority: 'medium'
  correlationId: 'dms-cpu'
  # NOTE(review): this is a string, not a boolean — confirm the consumer
  # expects a string here.
  isBanner: 'false'
  alerts:
  - type: custom
    enabled: true
    description: 'CPU % usage above [CONDITION_VALUE]'
    failure-threshold: 2
    success-threshold: 1
    send-on-resolved: true
| 27 | + |
# Hardware check: host memory utilisation, read from Prometheus.
# NOTE(review): nesting was reconstructed from a flattened diff view —
# confirm against the consumer's schema.
- name: 'Host Memory Check'
  enabled: true
  group: 'hardware'
  # PromQL: 100*(1-node_memory_MemAvailable_bytes/node_memory_MemTotal_bytes)
  url: 'http://prometheus.dms.dappnode:9090/api/v1/query?query=100*(1-node_memory_MemAvailable_bytes/node_memory_MemTotal_bytes)'
  method: 'GET'
  interval: '30s'
  conditions:
  # Passes while the first result's value is at most 90.
  - '[BODY].data.result[0].value[1] <= 90'
  # Range and unit of the checked value (presumably used when presenting the
  # threshold to the user — confirm).
  metric:
    min: 0
    max: 100
    unit: '%'
  definition:
    title: 'Configure your Memory Usage Alert'
    description: 'Triggers if memory usage exceeds the limit defined in the condition'
  priority: 'medium'
  correlationId: 'dms-memory'
  # NOTE(review): this is a string, not a boolean — confirm the consumer
  # expects a string here.
  isBanner: 'false'
  alerts:
  - type: custom
    enabled: true
    description: 'Memory % usage above [CONDITION_VALUE]'
    failure-threshold: 2
    success-threshold: 1
    send-on-resolved: true
| 53 | + |
# Hardware check: average filesystem utilisation, read from Prometheus.
# NOTE(review): nesting was reconstructed from a flattened diff view —
# confirm against the consumer's schema.
- name: 'Host Disk Space Check'
  enabled: true
  group: 'hardware'
  # Percent-encoded PromQL. Decoded:
  #   avg((1-node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"}
  #     /node_filesystem_size_bytes)*100)
  # The regex metacharacters `^` and `|` are written as %5E and %7C because
  # they are not valid raw characters in a URI query; the decoded query is
  # unchanged.
  url: 'http://prometheus.dms.dappnode:9090/api/v1/query?query=avg((1-node_filesystem_avail_bytes%7Bfstype!~%22%5E(fuse.*%7Ctmpfs%7Ccifs%7Cnfs)%22%7D/node_filesystem_size_bytes)*100)'
  method: 'GET'
  interval: '30s'
  conditions:
  # Passes while the first result's value is at most 90.
  - '[BODY].data.result[0].value[1] <= 90'
  # Range and unit of the checked value (presumably used when presenting the
  # threshold to the user — confirm).
  metric:
    min: 0
    max: 100
    unit: '%'
  definition:
    title: 'Configure your Disk Space Alert'
    description: 'Triggers if disk usage exceeds the limit defined in the condition'
  priority: 'high'
  correlationId: 'dms-disk'
  # NOTE(review): this is a string, not a boolean — confirm the consumer
  # expects a string here.
  isBanner: 'false'
  alerts:
  - type: custom
    enabled: true
    description: 'Disk % usage above [CONDITION_VALUE]'
    failure-threshold: 2
    success-threshold: 1
    send-on-resolved: true
| 79 | + |
# Hardware check: average temperature over selected hwmon chips, read from
# Prometheus.
# NOTE(review): nesting was reconstructed from a flattened diff view —
# confirm against the consumer's schema.
- name: 'Host Temperature Check'
  enabled: true
  group: 'hardware'
  # Percent-encoded PromQL. Decoded:
  #   avg(node_hwmon_temp_celsius{chip=~".*coretemp.*|.*18_3$|.*k10temp.*"})
  url: 'http://prometheus.dms.dappnode:9090/api/v1/query?query=avg%28node_hwmon_temp_celsius%7Bchip%3D~%22.*coretemp.*%7C.*18_3%24%7C.*k10temp.*%22%7D%29'
  method: 'GET'
  interval: '30s'
  conditions:
  # Passes while the first result's value is at most 85.
  # NOTE(review): this indexes result[0]; behaviour when no chip matches the
  # selector is not visible here — verify.
  - '[BODY].data.result[0].value[1] <= 85'
  # Range and unit of the checked value (presumably used when presenting the
  # threshold to the user — confirm).
  metric:
    min: 25
    max: 100
    unit: '°C'
  definition:
    title: 'Configure your Temperature Alert'
    description: 'Triggers if the average node temperature exceeds the defined threshold'
  priority: 'medium'
  correlationId: 'dms-temperature'
  # NOTE(review): this is a string, not a boolean — confirm the consumer
  # expects a string here.
  isBanner: 'false'
  alerts:
  - type: custom
    enabled: true
    description: 'Average node temperature above [CONDITION_VALUE]°C'
    failure-threshold: 2
    success-threshold: 1
    send-on-resolved: true
0 commit comments