Skip to content

Commit 4e51a8a

Browse files
authored
feat: flux alerts + optional alerting to Slack (#169)
* configured optional alerting routes to Fluence Slack channel * alerts for fluxcd
1 parent 70b40ad commit 4e51a8a

File tree

3 files changed

+225
-3
lines changed

3 files changed

+225
-3
lines changed
Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
apiVersion: monitoring.coreos.com/v1
2+
kind: PrometheusRule
3+
metadata:
4+
name: flux-system
5+
labels:
6+
app.kubernetes.io/part-of: spectrum-monitoring
7+
app.kubernetes.io/component: flux
8+
spec:
9+
groups:
10+
- name: flux-system
11+
rules:
12+
# Critical alert: a Flux HelmRelease whose `ready` label is anything other
# than "True" for the whole `for` window.
- alert: HelmReleaseNotReady
  expr: gotk_resource_info{customresource_kind="HelmRelease", ready!="True"} > 0
  # Pending period before the alert fires; the description below quotes the
  # same duration, so keep the two in sync when changing it.
  for: 5m
  labels:
    severity: critical
    service: fluxcd
  annotations:
    summary: "HelmRelease {{ $labels.name }} in namespace {{ $labels.exported_namespace }} is not ready"
    description: "HelmRelease {{ $labels.name }} in namespace {{ $labels.exported_namespace }} is in an unready state for more than 5 minutes."
21+
22+
# Critical alert: a Flux GitRepository whose `ready` label is anything other
# than "True" for the whole `for` window.
- alert: GitRepositorySyncFailed
  expr: gotk_resource_info{customresource_kind="GitRepository", ready!="True"} > 0
  # Pending period before the alert fires; the description below quotes the
  # same duration, so keep the two in sync when changing it.
  for: 5m
  labels:
    severity: critical
    service: fluxcd
  annotations:
    summary: "GitRepository {{ $labels.name }} in namespace {{ $labels.exported_namespace }} sync failed"
    description: "GitRepository {{ $labels.name }} in namespace {{ $labels.exported_namespace }} has not been successfully synced for more than 5 minutes."
31+
32+
# Critical alert: a Flux Kustomization whose `ready` label is anything other
# than "True" for the whole `for` window.
- alert: KustomizationNotApplied
  expr: gotk_resource_info{customresource_kind="Kustomization", ready!="True"} > 0
  # Pending period before the alert fires; the description below quotes the
  # same duration, so keep the two in sync when changing it.
  for: 5m
  labels:
    severity: critical
    service: fluxcd
  annotations:
    summary: "Kustomization {{ $labels.name }} in namespace {{ $labels.exported_namespace }} is not applied"
    description: "Kustomization {{ $labels.name }} in namespace {{ $labels.exported_namespace }} is not successfully applied for more than 5 minutes."
41+
42+
# Critical alert: a Flux ImageRepository whose `ready` label is anything other
# than "True" for the whole `for` window.
- alert: ImageRepositorySyncFailed
  expr: gotk_resource_info{customresource_kind="ImageRepository", ready!="True"} > 0
  # Pending period before the alert fires; the description below quotes the
  # same duration, so keep the two in sync when changing it.
  for: 5m
  labels:
    severity: critical
    service: fluxcd
  annotations:
    summary: "ImageRepository {{ $labels.name }} in namespace {{ $labels.exported_namespace }} sync failed"
    description: "ImageRepository {{ $labels.name }} in namespace {{ $labels.exported_namespace }} has not been successfully synced for more than 5 minutes."
51+
52+
# Critical alert: a Flux HelmChart whose `ready` label is anything other
# than "True" for the whole `for` window.
- alert: HelmChartFailed
  expr: gotk_resource_info{customresource_kind="HelmChart", ready!="True"} > 0
  # Pending period before the alert fires; the description below quotes the
  # same duration, so keep the two in sync when changing it.
  for: 5m
  labels:
    severity: critical
    service: fluxcd
  annotations:
    summary: "HelmChart {{ $labels.name }} in namespace {{ $labels.exported_namespace }} has failed"
    description: "HelmChart {{ $labels.name }} in namespace {{ $labels.exported_namespace }} is not ready for more than 5 minutes."
61+
62+
# Warning: a HelmRelease has carried suspended="true" for 5 minutes.
- alert: HelmReleaseSuspended
  annotations:
    summary: 'HelmRelease {{ $labels.name }} in namespace {{ $labels.exported_namespace }} is suspended'
    description: 'HelmRelease {{ $labels.name }} in namespace {{ $labels.exported_namespace }} has been suspended.'
  labels:
    service: fluxcd
    severity: warning
  # Selects only HelmRelease series whose `suspended` label equals "true".
  expr: 'gotk_resource_info{customresource_kind="HelmRelease", suspended="true"} > 0'
  for: 5m
71+
72+
# Warning: a GitRepository has carried suspended="true" for 5 minutes.
- alert: GitRepositorySuspended
  annotations:
    summary: 'GitRepository {{ $labels.name }} in namespace {{ $labels.exported_namespace }} is suspended'
    description: 'GitRepository {{ $labels.name }} in namespace {{ $labels.exported_namespace }} has been suspended.'
  labels:
    service: fluxcd
    severity: warning
  # Selects only GitRepository series whose `suspended` label equals "true".
  expr: 'gotk_resource_info{customresource_kind="GitRepository", suspended="true"} > 0'
  for: 5m
81+
82+
# Warning: a Kustomization has carried suspended="true" for 5 minutes.
- alert: KustomizationSuspended
  annotations:
    summary: 'Kustomization {{ $labels.name }} in namespace {{ $labels.exported_namespace }} is suspended'
    description: 'Kustomization {{ $labels.name }} in namespace {{ $labels.exported_namespace }} has been suspended.'
  labels:
    service: fluxcd
    severity: warning
  # Selects only Kustomization series whose `suspended` label equals "true".
  expr: 'gotk_resource_info{customresource_kind="Kustomization", suspended="true"} > 0'
  for: 5m
91+
92+
# Warning: an ImageRepository has carried suspended="true" for 5 minutes.
- alert: ImageRepositorySuspended
  annotations:
    summary: 'ImageRepository {{ $labels.name }} in namespace {{ $labels.exported_namespace }} is suspended'
    description: 'ImageRepository {{ $labels.name }} in namespace {{ $labels.exported_namespace }} has been suspended.'
  labels:
    service: fluxcd
    severity: warning
  # Selects only ImageRepository series whose `suspended` label equals "true".
  expr: 'gotk_resource_info{customresource_kind="ImageRepository", suspended="true"} > 0'
  for: 5m
101+
102+
# Warning: a HelmChart has carried suspended="true" for 5 minutes.
- alert: HelmChartSuspended
  annotations:
    summary: 'HelmChart {{ $labels.name }} in namespace {{ $labels.exported_namespace }} is suspended'
    description: 'HelmChart {{ $labels.name }} in namespace {{ $labels.exported_namespace }} has been suspended.'
  labels:
    service: fluxcd
    severity: warning
  # Selects only HelmChart series whose `suspended` label equals "true".
  expr: 'gotk_resource_info{customresource_kind="HelmChart", suspended="true"} > 0'
  for: 5m

flux/components/monitoring/configs/flux/kustomization.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ kind: Kustomization
33
namespace: monitoring
44
resources:
55
- podmonitor.yaml
6+
- alerts.yaml
67
configMapGenerator:
78
- name: flux-grafana-dashboards
89
files:

flux/components/monitoring/controllers/kube-prometheus-stack/release.yaml

Lines changed: 114 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ kind: HelmRelease
33
metadata:
44
name: kube-prometheus-stack
55
spec:
6-
interval: 1h
6+
interval: 5m
77
chartRef:
88
kind: OCIRepository
99
name: kube-prometheus-stack
@@ -29,18 +29,129 @@ spec:
2929
# https://github.com/prometheus-community/helm-charts/blob/main/charts/kube-prometheus-stack/values.yaml
3030
values:
3131
alertmanager:
32-
enabled: false
32+
enabled: true
33+
alertmanagerSpec:
34+
priorityClassName: spectrum-monitoring
35+
secrets: [
36+
slack-api-url,
37+
slack-templates
38+
]
39+
40+
config:
41+
route:
42+
group_by:
43+
- alertname
44+
- namespace
45+
- severity
46+
group_wait: 30s
47+
group_interval: 5m
48+
repeat_interval: 7d
49+
receiver: blackhole
50+
routes:
51+
- receiver: 'slack-all'
52+
matchers:
53+
- service != "fluxcd"
54+
- severity =~ "warning|critical"
55+
- receiver: 'slack-fluxcd'
56+
matchers:
57+
- service = "fluxcd"
58+
receivers:
59+
- name: blackhole
60+
- name: slack-all
61+
slack_configs:
62+
- channel: '#{{- template "slack_channel_main" . -}}'
63+
api_url_file: /etc/alertmanager/secrets/slack-api-url/slack-all
64+
send_resolved: true
65+
title: '{{ template "slack.main.title" . }}'
66+
text: '{{ template "slack.main.text" . }}'
67+
icon_url: https://upload.wikimedia.org/wikipedia/commons/thumb/3/38/Prometheus_software_logo.svg/500px-Prometheus_software_logo.svg.png
68+
- name: slack-fluxcd
69+
slack_configs:
70+
- channel: '#{{- template "slack_channel_flux" . -}}'
71+
api_url_file: /etc/alertmanager/secrets/slack-api-url/slack-fluxcd
72+
send_resolved: true
73+
title: '{{ template "slack.main.title" . }}'
74+
text: '{{ template "slack.main.text" . }}'
75+
icon_url: https://upload.wikimedia.org/wikipedia/commons/thumb/3/38/Prometheus_software_logo.svg/500px-Prometheus_software_logo.svg.png
76+
templates:
77+
- '/etc/alertmanager/config/*.tmpl'
78+
- '/etc/alertmanager/secrets/slack-templates/*.tmpl'
79+
templateFiles:
80+
template_1.tmpl: |-
81+
{{ define "__main_title" }}
82+
[{{ .Status | toUpper }}
83+
{{- if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{- template "provider" . -}}-{{- template "cluster_name" . -}}-{{- template "ip_address" . -}} {{ with .GroupLabels }} {{- .Values | join " " }} {{- end -}}
84+
{{ end }}
85+
86+
{{/* Slack body for firing alerts. Iterates only the alerts that are still firing, so resolved members of the same group are not listed as active. The timestamp is converted to UTC because the layout prints a literal "UTC" suffix. */}}
{{ define "__main_body_firing" }}
{{ range .Alerts.Firing }}
*Cluster:* {{ template "provider" . }}-{{- template "cluster_name" . -}}-{{- template "ip_address" . -}}
*Summary:* {{ .Annotations.summary }}
*Description:* {{ .Annotations.description }}
*Since:* {{ .StartsAt.UTC.Format "02/01/06 15:04 UTC" }}
*Details:*
{{ range .Labels.SortedPairs }}• {{ .Name }}: `{{ .Value }}`
{{ end }}{{ end }}
{{ end }}
96+
97+
{{/* Slack body for resolved alerts. Iterates only resolved alerts, since EndsAt is shown for each entry. Prefers the optional "resolved" annotation over summary/description when it is set. Timestamps are converted to UTC because the layout prints a literal "UTC" suffix. */}}
{{ define "__main_body_resolved" }}
{{ range .Alerts.Resolved }}
*Cluster:* {{ template "provider" . }}-{{- template "cluster_name" . -}}-{{- template "ip_address" . -}}
*Message:* {{ if .Annotations.resolved }}{{ .Annotations.resolved }}{{ else }}{{ .Annotations.summary }}{{ end }}
*Description:* {{ if .Annotations.resolved }}{{ .Annotations.resolved }}{{ else }}{{ .Annotations.description }}{{ end }}
*Started at:* {{ .StartsAt.UTC.Format "02/01/06 15:04 UTC" }}
*Ended at:* {{ .EndsAt.UTC.Format "02/01/06 15:04 UTC" }}
*Details:*
{{ range .Labels.SortedPairs }}• {{ .Name }}: `{{ .Value }}`
{{ end }}{{ end }}
{{ end }}
108+
109+
{{ define "slack.main.title" }}{{ template "__main_title" . }}{{ end }}
110+
111+
{{ define "slack.main.text" }}
112+
{{ if eq (len .Alerts.Firing) 1 -}}
113+
{{ template "__main_body_firing" . }}
114+
{{- else if gt (len .Alerts.Firing) 1 -}}
115+
*Alerts:* {{ template "__main_body_firing" . }}
116+
{{- else -}}
117+
{{ template "__main_body_resolved" . }}
118+
{{- end -}}
119+
{{ end }}
120+
121+
{{ define "slack.main.dashboard" }}{{ template "__main_dashboard" . }}{{ end }}
122+
{{ define "slack.main.link" }}{{ template "__main_link" . }}{{ end }}
123+
{{ define "slack.main.silence" }}{{ template "__main_silence" . }}{{ end }}
124+
{{ define "slack.main.explore" }}{{ template "__main_explore" . }}{{ end }}
125+
customRules:
126+
KubeStateMetricsListErrors:
127+
severity: info
128+
KubeClientCertificateExpiration:
129+
severity: info
130+
KubeControllerManagerDown:
131+
severity: info
132+
KubeSchedulerDown:
133+
severity: info
134+
PrometheusNotConnectedToAlertmanagers:
135+
severity: info
136+
PrometheusDuplicateTimestamps:
137+
severity: info
138+
PrometheusRuleFailures:
139+
severity: info
140+
KubeProxyDown:
141+
severity: info
142+
33143
prometheusOperator:
34144
priorityClassName: spectrum-monitoring
35145
prometheus:
36146
prometheusSpec:
37147
priorityClassName: spectrum-monitoring
38-
retention: 168h
148+
retention: 200h
39149
resources:
40150
requests:
41151
cpu: 200m
42152
memory: 200Mi
43153

154+
ruleSelectorNilUsesHelmValues: false
44155
serviceMonitorNamespaceSelector: {}
45156
serviceMonitorSelector:
46157
matchExpressions:

0 commit comments

Comments
 (0)