Skip to content

Commit

Permalink
[kube-prometheus-stack]: add additional alert labels on prometheusRul…
Browse files Browse the repository at this point in the history
…es (prometheus-community#247)

* [kube-prometheus-stack]: add additional alert labels on prometheusRules

Signed-off-by: Bertrand Mailhe <bmailhe@leadformance.com>

* doc(kube-prometheus-stack): add defaultRules.kube-prometheus-stack parameter

Signed-off-by: Bertrand Mailhe <bmailhe@leadformance.com>

* bump version

Signed-off-by: Bertrand Mailhe <bmailhe@leadformance.com>
  • Loading branch information
bmailhe authored Oct 30, 2020
1 parent 72672f0 commit 86947d2
Show file tree
Hide file tree
Showing 33 changed files with 619 additions and 7 deletions.
2 changes: 1 addition & 1 deletion charts/kube-prometheus-stack/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ name: kube-prometheus-stack
sources:
- https://github.com/prometheus-community/helm-charts
- https://github.com/prometheus-operator/kube-prometheus
version: 10.3.2
version: 10.3.3
appVersion: 0.42.1
tillerVersion: ">=2.12.0"
kubeVersion: ">=1.16.0-0"
Expand Down
24 changes: 23 additions & 1 deletion charts/kube-prometheus-stack/hack/sync_prometheus_rules.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import requests
import yaml
from yaml.representer import SafeRepresenter

import re

# https://stackoverflow.com/a/20863889/961092
class LiteralStr(str):
Expand Down Expand Up @@ -217,6 +217,27 @@ def add_rules_conditions(rules, indent=4):
return rules


def add_custom_labels(rules, indent=4):
    """Append the Helm ``additionalRuleLabels`` block after every alert.

    ``rules`` is the YAML text of a rule group. A line that starts with
    exactly ``indent`` spaces followed by ``- alert:`` marks the start of an
    alert. The conditional label block is inserted on its own lines right
    before each subsequent alert start, and once more at the end of the
    text, so every alert block is followed by it.

    Args:
        rules: YAML text containing the rendered rules.
        indent: Number of spaces in front of each ``- alert:`` list item.

    Returns:
        The text with the label block added, or ``rules`` unchanged when it
        contains no alert.
    """
    # NOTE(review): assumes ``labels:`` is the last key of each alert, so the
    # appended ``toYaml`` output extends that mapping -- confirm against the
    # YAML dumper's key ordering.
    # Label entries sit two mapping levels below the list dash.
    label_indent = indent + 4
    rule_condition = (
        '{{- if .Values.defaultRules.additionalRuleLabels }}\n'
        '{{ toYaml .Values.defaultRules.additionalRuleLabels | indent %d }}\n'
        '{{- end }}' % label_indent
    )

    # Anchor at line start so deeper-indented text that merely contains
    # "- alert:" is never treated as the beginning of a rule.
    alert_start = re.compile("^" + " " * indent + "- alert:", re.MULTILINE)
    starts = [match.start() for match in alert_start.finditer(rules)]
    if not starts:
        return rules

    pieces = []
    cursor = 0
    # The first alert has no preceding alert block to close, so skip it.
    for start in starts[1:]:
        # start - 1 is the newline that ends the previous alert block.
        newline_at = start - 1
        pieces.append(rules[cursor:newline_at])
        pieces.append("\n" + rule_condition)
        cursor = newline_at

    # Close the last alert block, keeping a final newline if there was one
    # instead of assuming the text always ends with it.
    remainder = rules[cursor:]
    if remainder.endswith("\n"):
        pieces.append(remainder[:-1])
        pieces.append("\n" + rule_condition + "\n")
    else:
        pieces.append(remainder)
        pieces.append("\n" + rule_condition)
    return "".join(pieces)

def write_group_to_file(group, url, destination, min_kubernetes, max_kubernetes):
fix_expr(group['rules'])
group_name = group['name']
Expand All @@ -231,6 +252,7 @@ def write_group_to_file(group, url, destination, min_kubernetes, max_kubernetes)
if replacement_map[line]['init']:
init_line += '\n' + replacement_map[line]['init']
# append per-alert rules
rules = add_custom_labels(rules)
rules = add_rules_conditions(rules)
# initialize header
lines = header % {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,13 +41,19 @@ spec:
for: 5m
labels:
severity: critical
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
- alert: AlertmanagerFailedReload
annotations:
message: Reloading Alertmanager's configuration has failed for {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod{{`}}`}}.
expr: alertmanager_config_last_reload_successful{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"} == 0
for: 10m
labels:
severity: warning
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
- alert: AlertmanagerMembersInconsistent
annotations:
message: Alertmanager has not found all other members of the cluster.
Expand All @@ -58,4 +64,7 @@ spec:
for: 5m
labels:
severity: critical
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
{{- end }}
Original file line number Diff line number Diff line change
Expand Up @@ -39,27 +39,39 @@ spec:
for: 3m
labels:
severity: critical
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
- alert: etcdInsufficientMembers
annotations:
message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": insufficient members ({{`{{`}} $value {{`}}`}}).'
expr: sum(up{job=~".*etcd.*"} == bool 1) by (job) < ((count(up{job=~".*etcd.*"}) by (job) + 1) / 2)
for: 3m
labels:
severity: critical
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
- alert: etcdNoLeader
annotations:
message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": member {{`{{`}} $labels.instance {{`}}`}} has no leader.'
expr: etcd_server_has_leader{job=~".*etcd.*"} == 0
for: 1m
labels:
severity: critical
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
- alert: etcdHighNumberOfLeaderChanges
annotations:
message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}} leader changes within the last 15 minutes. Frequent elections may be a sign of insufficient resources, high network latency, or disruptions by other components and should be investigated.'
expr: increase((max by (job) (etcd_server_leader_changes_seen_total{job=~".*etcd.*"}) or 0*absent(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}))[15m:1m]) >= 3
for: 5m
labels:
severity: warning
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
- alert: etcdHighNumberOfFailedGRPCRequests
annotations:
message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}}% of requests for {{`{{`}} $labels.grpc_method {{`}}`}} failed on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
Expand All @@ -71,6 +83,9 @@ spec:
for: 10m
labels:
severity: warning
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
- alert: etcdHighNumberOfFailedGRPCRequests
annotations:
message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}}% of requests for {{`{{`}} $labels.grpc_method {{`}}`}} failed on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
Expand All @@ -82,6 +97,9 @@ spec:
for: 5m
labels:
severity: critical
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
- alert: etcdGRPCRequestsSlow
annotations:
message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": gRPC requests to {{`{{`}} $labels.grpc_method {{`}}`}} are taking {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
Expand All @@ -91,6 +109,9 @@ spec:
for: 10m
labels:
severity: critical
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
- alert: etcdMemberCommunicationSlow
annotations:
message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": member communication with {{`{{`}} $labels.To {{`}}`}} is taking {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
Expand All @@ -100,13 +121,19 @@ spec:
for: 10m
labels:
severity: warning
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
- alert: etcdHighNumberOfFailedProposals
annotations:
message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}} proposal failures within the last 30 minutes on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
expr: rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5
for: 15m
labels:
severity: warning
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
- alert: etcdHighFsyncDurations
annotations:
message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": 99th percentile fync durations are {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
Expand All @@ -116,6 +143,9 @@ spec:
for: 10m
labels:
severity: warning
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
- alert: etcdHighCommitDurations
annotations:
message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": 99th percentile commit durations {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
Expand All @@ -125,6 +155,9 @@ spec:
for: 10m
labels:
severity: warning
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
- alert: etcdHighNumberOfFailedHTTPRequests
annotations:
message: '{{`{{`}} $value {{`}}`}}% of requests for {{`{{`}} $labels.method {{`}}`}} failed on etcd instance {{`{{`}} $labels.instance {{`}}`}}'
Expand All @@ -134,6 +167,9 @@ spec:
for: 10m
labels:
severity: warning
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
- alert: etcdHighNumberOfFailedHTTPRequests
annotations:
message: '{{`{{`}} $value {{`}}`}}% of requests for {{`{{`}} $labels.method {{`}}`}} failed on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
Expand All @@ -143,6 +179,9 @@ spec:
for: 10m
labels:
severity: critical
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
- alert: etcdHTTPRequestsSlow
annotations:
message: etcd instance {{`{{`}} $labels.instance {{`}}`}} HTTP requests to {{`{{`}} $labels.method {{`}}`}} are slow.
Expand All @@ -152,4 +191,7 @@ spec:
for: 10m
labels:
severity: warning
{{- end }}
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
{{- end }}
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,9 @@ spec:
for: 10m
labels:
severity: warning
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
- alert: Watchdog
annotations:
message: 'This is an alert meant to ensure that the entire alerting pipeline is functional.
Expand All @@ -47,4 +50,7 @@ spec:
expr: vector(1)
labels:
severity: none
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
{{- end }}
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,9 @@ spec:
long: 1h
severity: critical
short: 5m
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
- alert: KubeAPIErrorBudgetBurn
annotations:
description: The API server is burning too much error budget.
Expand All @@ -52,6 +55,9 @@ spec:
long: 6h
severity: critical
short: 30m
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
- alert: KubeAPIErrorBudgetBurn
annotations:
description: The API server is burning too much error budget.
Expand All @@ -66,6 +72,9 @@ spec:
long: 1d
severity: warning
short: 2h
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
- alert: KubeAPIErrorBudgetBurn
annotations:
description: The API server is burning too much error budget.
Expand All @@ -80,4 +89,7 @@ spec:
long: 3d
severity: warning
short: 6h
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
{{- end }}
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,9 @@ spec:
for: 15m
labels:
severity: critical
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
- alert: KubeStateMetricsWatchErrors
annotations:
description: kube-state-metrics is experiencing errors at an elevated rate in watch operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all.
Expand All @@ -50,4 +53,7 @@ spec:
for: 15m
labels:
severity: critical
{{- if .Values.defaultRules.additionalRuleLabels }}
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
{{- end }}
{{- end }}
Loading

0 comments on commit 86947d2

Please sign in to comment.