Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions configs/rules/alerts-cluster.yml
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ groups:
annotations:
summary: "VMStorage high memory usage"
description: "VMStorage instance {{ $labels.instance }} is using more than 90% of available memory.\nVALUE = {{ $value }}\nLABELS = {{ $labels }}"
expr: vm_app_memory_usage_bytes{job="vmstorage"} / vm_app_memory_limit_bytes{job="vmstorage"} > 0.9
expr: go_memstats_alloc_bytes{job="vmstorage"} > 500000000
for: 5m
labels:
severity: warning
Expand All @@ -56,7 +56,7 @@ groups:
annotations:
summary: "VMInsert high memory usage"
description: "VMInsert instance {{ $labels.instance }} is using more than 90% of available memory.\nVALUE = {{ $value }}\nLABELS = {{ $labels }}"
expr: vm_app_memory_usage_bytes{job="vminsert"} / vm_app_memory_limit_bytes{job="vminsert"} > 0.9
expr: go_memstats_alloc_bytes{job="vminsert"} > 200000000
for: 5m
labels:
severity: warning
Expand All @@ -66,7 +66,7 @@ groups:
annotations:
summary: "VMSelect high memory usage"
description: "VMSelect instance {{ $labels.instance }} is using more than 90% of available memory.\nVALUE = {{ $value }}\nLABELS = {{ $labels }}"
expr: vm_app_memory_usage_bytes{job="vmselect"} / vm_app_memory_limit_bytes{job="vmselect"} > 0.9
expr: go_memstats_alloc_bytes{job="vmselect"} > 200000000
for: 5m
labels:
severity: warning
Expand All @@ -76,7 +76,7 @@ groups:
annotations:
summary: "VMStorage disk space is running low"
description: "VMStorage instance {{ $labels.instance }} has less than 15% free disk space.\nVALUE = {{ $value }}\nLABELS = {{ $labels }}"
expr: vm_free_disk_space_bytes{job="vmstorage"} / vm_data_size_bytes{job="vmstorage"} < 0.15
expr: vm_free_disk_space_bytes{job="vmstorage"} / (sum(vm_data_size_bytes{job="vmstorage"}) by (instance) + vm_free_disk_space_bytes{job="vmstorage"}) < 0.15
for: 10m
labels:
severity: warning
Expand All @@ -86,7 +86,7 @@ groups:
annotations:
summary: "VMStorage disk space is critically low"
description: "VMStorage instance {{ $labels.instance }} has less than 5% free disk space.\nVALUE = {{ $value }}\nLABELS = {{ $labels }}"
expr: vm_free_disk_space_bytes{job="vmstorage"} / vm_data_size_bytes{job="vmstorage"} < 0.05
expr: vm_free_disk_space_bytes{job="vmstorage"} / (sum(vm_data_size_bytes{job="vmstorage"}) by (instance) + vm_free_disk_space_bytes{job="vmstorage"}) < 0.05
for: 5m
labels:
severity: critical
Expand Down
12 changes: 6 additions & 6 deletions configs/rules/alerts-container-health.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ groups:
description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
labels:
severity: warning
expr: (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
expr: (node_memory_MemAvailable_bytes{job="node-exporter"} / node_memory_MemTotal_bytes{job="node-exporter"} * 100 < 10) * on(instance) group_left (nodename) node_uname_info{job="node-exporter",nodename=~".+"}
for: 2m

- alert: NodeMemoryUnderMemoryPressure
Expand All @@ -17,7 +17,7 @@ groups:
description: "The node is under heavy memory pressure. High rate of major page faults\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
labels:
severity: warning
expr: (rate(node_vmstat_pgmajfault[1m]) > 1000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
expr: (rate(node_vmstat_pgmajfault{job="node-exporter"}[1m]) > 1000) * on(instance) group_left (nodename) node_uname_info{job="node-exporter",nodename=~".+"}
for: 2m

- alert: NodeFilesystemAlmostOutOfSpace
Expand All @@ -26,9 +26,9 @@ groups:
summary: "Filesystem has less than 5% space left."
expr: |
(
node_filesystem_avail_bytes{job="node",fstype!=""} / node_filesystem_size_bytes{job="node",fstype!=""} * 100 < 5
node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 5
and
node_filesystem_readonly{job="node",fstype!=""} == 0
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
)
for: 1h
labels:
Expand All @@ -40,9 +40,9 @@ groups:
summary: "Filesystem has less than 3% space left."
expr: |
(
node_filesystem_avail_bytes{job="node",fstype!=""} / node_filesystem_size_bytes{job="node",fstype!=""} * 100 < 3
node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 3
and
node_filesystem_readonly{job="node",fstype!=""} == 0
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
)
for: 1h
labels:
Expand Down
6 changes: 3 additions & 3 deletions configs/rules/alerts-vmagent.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ groups:
annotations:
summary: "VMAgent lost connection to remote storage"
description: "VMAgent instance {{ $labels.instance }} cannot connect to remote write endpoint.\nVALUE = {{ $value }}\nLABELS = {{ $labels }}"
expr: vmagent_remotewrite_conn_state{job="vmagent"} == 0
expr: vmagent_remotewrite_errors_total{job="vmagent"} > 0
for: 2m
labels:
severity: critical
Expand All @@ -36,7 +36,7 @@ groups:
annotations:
summary: "VMAgent high memory usage"
description: "VMAgent instance {{ $labels.instance }} is using more than 90% of available memory.\nVALUE = {{ $value }}\nLABELS = {{ $labels }}"
expr: vm_app_memory_usage_bytes{job="vmagent"} / vm_app_memory_limit_bytes{job="vmagent"} > 0.9
expr: go_memstats_alloc_bytes{job="vmagent"} > 200000000
for: 5m
labels:
severity: warning
Expand Down Expand Up @@ -66,7 +66,7 @@ groups:
annotations:
summary: "VMAgent configuration reload failed"
description: "VMAgent instance {{ $labels.instance }} failed to reload configuration.\nVALUE = {{ $value }}\nLABELS = {{ $labels }}"
expr: vmagent_config_last_reload_successful{job="vmagent"} == 0
expr: vm_promscrape_config_last_reload_successful{job="vmagent"} == 0
for: 2m
labels:
severity: warning
Expand Down
4 changes: 2 additions & 2 deletions configs/rules/alerts-vmalert.yml
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ groups:
annotations:
summary: "VMAlert datasource is unavailable"
description: "VMAlert instance {{ $labels.instance }} cannot reach datasource {{ $labels.addr }}.\nVALUE = {{ $value }}\nLABELS = {{ $labels }}"
expr: vmalert_datasource_requests_errors_total > 0
expr: vmalert_datasource_dial_errors_total > 0
for: 5m
labels:
severity: critical
Expand All @@ -56,7 +56,7 @@ groups:
annotations:
summary: "VMAlert high memory usage"
description: "VMAlert instance {{ $labels.instance }} is using more than 90% of available memory.\nVALUE = {{ $value }}\nLABELS = {{ $labels }}"
expr: vm_app_memory_usage_bytes{job="vmalert"} / vm_app_memory_limit_bytes{job="vmalert"} > 0.9
expr: go_memstats_alloc_bytes{job="vmalert"} > 100000000
for: 5m
labels:
severity: warning
Expand Down