From 467de5c0dc59923d44d1876ac98d43f42603fb8a Mon Sep 17 00:00:00 2001 From: Wesley Bermbach Date: Tue, 29 Oct 2019 16:29:13 +0100 Subject: [PATCH] Add remote alertmanager for operators --- .ci/generate_monitoring_docs | 12 +- .../templates/secret-alerting-smtp.yaml | 1 - .../templates/secret-alerting.yaml | 1 + .../controller-manager/deployment.yaml | 2 +- .../templates/_secret-alerting-smtp.yaml | 26 ---- .../templates/_secret-alerting.yaml | 43 ++++++ charts/gardener/values.yaml | 5 +- .../templates/alertmanager/alertmanager.yaml | 2 + .../templates/alertmanager/config.yaml | 2 + charts/seed-bootstrap/values.yaml | 1 + .../charts/prometheus/templates/config.yaml | 23 +++ .../prometheus/templates/prometheus.yaml | 9 ++ .../charts/core/charts/prometheus/values.yaml | 15 ++ docs/README.md | 4 + docs/monitoring/alerting.md | 137 ++++++++++++++++++ .../operator_alerts.md | 0 .../user_alerts.md | 0 docs/usage/configuration.md | 13 +- example/10-secret-alerting-smtp.yaml | 17 --- example/10-secret-alerting.yaml | 36 +++++ .../v1alpha1/constants/types_constants.go | 2 - pkg/operation/botanist/monitoring.go | 91 +++++++++++- pkg/operation/common/types.go | 3 + pkg/operation/garden/garden.go | 44 ++++-- pkg/operation/operation.go | 8 +- pkg/operation/seed/seed.go | 42 ++++-- 26 files changed, 451 insertions(+), 88 deletions(-) delete mode 100644 charts/gardener/charts/application/templates/secret-alerting-smtp.yaml create mode 100644 charts/gardener/charts/application/templates/secret-alerting.yaml delete mode 100644 charts/gardener/charts/utils-common/templates/_secret-alerting-smtp.yaml create mode 100644 charts/gardener/charts/utils-common/templates/_secret-alerting.yaml create mode 100644 docs/monitoring/alerting.md rename docs/{development => monitoring}/operator_alerts.md (100%) rename docs/{development => monitoring}/user_alerts.md (100%) delete mode 100644 example/10-secret-alerting-smtp.yaml create mode 100644 example/10-secret-alerting.yaml diff --git 
a/.ci/generate_monitoring_docs b/.ci/generate_monitoring_docs index 7dc3b4ee95f..79464a3153e 100755 --- a/.ci/generate_monitoring_docs +++ b/.ci/generate_monitoring_docs @@ -38,25 +38,25 @@ for t in $tools; do done pushd $SOURCE_PATH/charts/seed-monitoring/charts/core/charts/prometheus > /dev/null -cat < $SOURCE_PATH/docs/development/user_alerts.md +cat < $SOURCE_PATH/docs/monitoring/user_alerts.md # User Alerts |Alertname|Severity|Type|Description| |---|---|---|---| EOF -cat < $SOURCE_PATH/docs/development/operator_alerts.md +cat < $SOURCE_PATH/docs/monitoring/operator_alerts.md # Operator Alerts |Alertname|Severity|Type|Description| |---|---|---|---| EOF for file in rules/*.yaml; do - cat $file | yaml2json | jq -r '.groups | .[].rules | map(select(.labels.visibility == "owner" or .labels.visibility == "all")) | map(select(has("alert"))) | .[] | "|" + .alert + "|" + .labels.severity + "|" + .labels.type + "|" + "`" + .annotations.description + "`" + "|"' >> $SOURCE_PATH/docs/development/user_alerts.md - cat $file | yaml2json | jq -r '.groups | .[].rules | map(select(.labels.visibility == "operator" or .labels.visibility == "all")) | map(select(has("alert"))) | .[] | "|" + .alert + "|" + .labels.severity + "|" + .labels.type + "|" + "`" + .annotations.description + "`" + "|"' >> $SOURCE_PATH/docs/development/operator_alerts.md + cat $file | yaml2json | jq -r '.groups | .[].rules | map(select(.labels.visibility == "owner" or .labels.visibility == "all")) | map(select(has("alert"))) | .[] | "|" + .alert + "|" + .labels.severity + "|" + .labels.type + "|" + "`" + .annotations.description + "`" + "|"' >> $SOURCE_PATH/docs/monitoring/user_alerts.md + cat $file | yaml2json | jq -r '.groups | .[].rules | map(select(.labels.visibility == "operator" or .labels.visibility == "all")) | map(select(has("alert"))) | .[] | "|" + .alert + "|" + .labels.severity + "|" + .labels.type + "|" + "`" + .annotations.description + "`" + "|"' >> 
$SOURCE_PATH/docs/monitoring/operator_alerts.md done popd > /dev/null if [ -n "$(git status --porcelain)" ]; then - git add $SOURCE_PATH/docs/development/user_alerts.md - git add $SOURCE_PATH/docs/development/operator_alerts.md + git add $SOURCE_PATH/docs/monitoring/user_alerts.md + git add $SOURCE_PATH/docs/monitoring/operator_alerts.md git commit -m "Update alert documentation" else echo "no changes"; diff --git a/charts/gardener/charts/application/templates/secret-alerting-smtp.yaml b/charts/gardener/charts/application/templates/secret-alerting-smtp.yaml deleted file mode 100644 index 404f5b8db53..00000000000 --- a/charts/gardener/charts/application/templates/secret-alerting-smtp.yaml +++ /dev/null @@ -1 +0,0 @@ -{{- include "gardener.secret-alerting-smtp" . }} diff --git a/charts/gardener/charts/application/templates/secret-alerting.yaml b/charts/gardener/charts/application/templates/secret-alerting.yaml new file mode 100644 index 00000000000..24759cd43af --- /dev/null +++ b/charts/gardener/charts/application/templates/secret-alerting.yaml @@ -0,0 +1 @@ +{{- include "gardener.secret-alerting" . }} diff --git a/charts/gardener/charts/runtime/templates/controller-manager/deployment.yaml b/charts/gardener/charts/runtime/templates/controller-manager/deployment.yaml index 3cc13ad8d2d..d2425e61e1f 100644 --- a/charts/gardener/charts/runtime/templates/controller-manager/deployment.yaml +++ b/charts/gardener/charts/runtime/templates/controller-manager/deployment.yaml @@ -27,7 +27,7 @@ spec: checksum/secret-gardener-controller-manager-kubeconfig: {{ include (print $.Template.BasePath "/controller-manager/secret-kubeconfig.yaml") . | sha256sum }} checksum/secret-default-domain: {{ include "gardener.secret-default-domain" . | sha256sum }} checksum/secret-internal-domain: {{ include "gardener.secret-internal-domain" . | sha256sum }} - checksum/secret-alerting-smtp: {{ include "gardener.secret-alerting-smtp" . 
| sha256sum }} + checksum/secret-alerting: {{ include "gardener.secret-alerting" . | sha256sum }} checksum/secret-openvpn-diffie-hellman: {{ include "gardener.secret-openvpn-diffie-hellman" . | sha256sum }} labels: app: gardener diff --git a/charts/gardener/charts/utils-common/templates/_secret-alerting-smtp.yaml b/charts/gardener/charts/utils-common/templates/_secret-alerting-smtp.yaml deleted file mode 100644 index f862e441ff6..00000000000 --- a/charts/gardener/charts/utils-common/templates/_secret-alerting-smtp.yaml +++ /dev/null @@ -1,26 +0,0 @@ -{{- define "gardener.secret-alerting-smtp" -}} -{{- if .Values.global.controller.enabled }} -{{- range $key, $config := .Values.global.controller.alertingSMTP }} ---- -apiVersion: v1 -kind: Secret -metadata: - name: alerting-smtp-{{ $key }} - namespace: garden - labels: - app: gardener - chart: "{{ $.Chart.Name }}-{{ $.Chart.Version }}" - release: "{{ $.Release.Name }}" - heritage: "{{ $.Release.Service }}" - garden.sapcloud.io/role: alerting-smtp -type: Opaque -data: - to: {{ ( required ".controller.alertingSMTP[].to is required" $config.to ) | b64enc }} - from: {{ ( required ".controller.alertingSMTP[].from is required" $config.from ) | b64enc }} - smarthost: {{ ( required ".controller.alertingSMTP[].smarthost is required" $config.smarthost ) | b64enc }} - auth_username: {{ ( required ".controller.alertingSMTP[].auth_username is required" $config.auth_username ) | b64enc }} - auth_identity: {{ ( required ".controller.alertingSMTP[].auth_identity is required" $config.auth_identity ) | b64enc }} - auth_password: {{ ( required ".controller.alertingSMTP[].auth_password is required" $config.auth_password ) | b64enc }} -{{- end }} -{{- end }} -{{- end -}} diff --git a/charts/gardener/charts/utils-common/templates/_secret-alerting.yaml b/charts/gardener/charts/utils-common/templates/_secret-alerting.yaml new file mode 100644 index 00000000000..f0b3bdd645c --- /dev/null +++ 
b/charts/gardener/charts/utils-common/templates/_secret-alerting.yaml @@ -0,0 +1,43 @@ +{{- define "gardener.secret-alerting" -}} +{{- if .Values.global.controller.enabled }} +{{- range $key, $config := .Values.global.controller.alerting }} +--- +apiVersion: v1 +kind: Secret +metadata: + name: alerting-{{ $key }} + namespace: garden + labels: + app: gardener + chart: "{{ $.Chart.Name }}-{{ $.Chart.Version }}" + release: "{{ $.Release.Name }}" + heritage: "{{ $.Release.Service }}" + gardener.cloud/role: alerting +type: Opaque +data: + auth_type: {{ ( required ".controller.alerting[].auth_type is required" $config.auth_type ) | b64enc }} +{{- if eq $config.auth_type "smtp" }} + to: {{ ( required ".controller.alerting[].to is required" $config.to ) | b64enc }} + from: {{ ( required ".controller.alerting[].from is required" $config.from ) | b64enc }} + smarthost: {{ ( required ".controller.alerting[].smarthost is required" $config.smarthost ) | b64enc }} + auth_username: {{ ( required ".controller.alerting[].auth_username is required" $config.auth_username ) | b64enc }} + auth_identity: {{ ( required ".controller.alerting[].auth_identity is required" $config.auth_identity ) | b64enc }} + auth_password: {{ ( required ".controller.alerting[].auth_password is required" $config.auth_password ) | b64enc }} +{{- end }} +{{- if eq $config.auth_type "none" }} + url: {{ ( required ".controller.alerting[].url is required" $config.url ) | b64enc }} +{{- end }} +{{- if eq $config.auth_type "basic" }} + url: {{ ( required ".controller.alerting[].url is required" $config.url ) | b64enc }} + username: {{ ( required ".controller.alerting[].username is required" $config.username ) | b64enc }} + password: {{ ( required ".controller.alerting[].password is required" $config.password ) | b64enc }} +{{- end }} +{{- if eq $config.auth_type "certificate" }} + url: {{ ( required ".controller.alerting[].url is required" $config.url ) | b64enc }} + ca.crt: {{ ( required 
".controller.alerting[].ca_crt is required" $config.ca_crt ) | b64enc }} + tls.crt: {{ ( required ".controller.alerting[].tls_crt is required" $config.tls_cert ) | b64enc }} + tls.key: {{ ( required ".controller.alerting[].tls_key is required" $config.tls_key ) | b64enc }} +{{- end }} +{{- end }} +{{- end }} +{{- end -}} diff --git a/charts/gardener/values.yaml b/charts/gardener/values.yaml index 8650a6da24c..bfbb410d3d6 100644 --- a/charts/gardener/values.yaml +++ b/charts/gardener/values.yaml @@ -148,8 +148,9 @@ global: # provider: aws-route53 # depends on the DNS extension of your choice # credentials: {} # # actual keys here depend on the DNS extension of your choice - alertingSMTP: [] - # - to: email-address-to-send-alerts-to + alerting: [] + # - auth_type: smtp + # to: email-address-to-send-alerts-to # from: email-address-to-send-alerts-from # smarthost: smtp-host-used-for-sending # auth_username: smtp-authentication-username diff --git a/charts/seed-bootstrap/templates/alertmanager/alertmanager.yaml b/charts/seed-bootstrap/templates/alertmanager/alertmanager.yaml index 73536dc77f6..08d0ed8f6c6 100644 --- a/charts/seed-bootstrap/templates/alertmanager/alertmanager.yaml +++ b/charts/seed-bootstrap/templates/alertmanager/alertmanager.yaml @@ -1,3 +1,4 @@ +{{ if .Values.alertmanager.enabled }} apiVersion: v1 kind: Service metadata: @@ -141,3 +142,4 @@ spec: resources: requests: storage: {{ .Values.alertmanager.storage }} +{{- end }} \ No newline at end of file diff --git a/charts/seed-bootstrap/templates/alertmanager/config.yaml b/charts/seed-bootstrap/templates/alertmanager/config.yaml index 0438f0fc602..ca7c59d148c 100644 --- a/charts/seed-bootstrap/templates/alertmanager/config.yaml +++ b/charts/seed-bootstrap/templates/alertmanager/config.yaml @@ -1,3 +1,4 @@ +{{ if .Values.alertmanager.enabled }} apiVersion: v1 kind: Secret metadata: @@ -5,3 +6,4 @@ metadata: namespace: {{ .Release.Namespace }} data: alertmanager.yaml: {{ include "config" 
.Values.alertmanager | b64enc }} +{{- end }} diff --git a/charts/seed-bootstrap/values.yaml b/charts/seed-bootstrap/values.yaml index 19b68d4edc5..9ea3de4d27e 100644 --- a/charts/seed-bootstrap/values.yaml +++ b/charts/seed-bootstrap/values.yaml @@ -115,6 +115,7 @@ fluentd-es: alertmanager: emailConfigs: [] + enabled: true storage: 1Gi hvpa: diff --git a/charts/seed-monitoring/charts/core/charts/prometheus/templates/config.yaml b/charts/seed-monitoring/charts/core/charts/prometheus/templates/config.yaml index f2200a1d658..abd38ae75c7 100644 --- a/charts/seed-monitoring/charts/core/charts/prometheus/templates/config.yaml +++ b/charts/seed-monitoring/charts/core/charts/prometheus/templates/config.yaml @@ -29,6 +29,29 @@ data: - /etc/prometheus/rules/*.yaml alerting: alertmanagers: +{{- if hasKey .Values.alerting.auth_type "none" }} + - static_configs: + - targets: + - {{ .Values.alerting.auth_type.none.url }} +{{- end }} +{{- if hasKey .Values.alerting.auth_type "basic" }} + - static_configs: + - targets: + - {{ .Values.alerting.auth_type.basic.url }} + basic_auth: + username: {{ .Values.alerting.auth_type.basic.username }} + password: {{ .Values.alerting.auth_type.basic.password }} +{{- end }} +{{- if hasKey .Values.alerting.auth_type "certificate" }} + - static_configs: + - targets: + - {{ .Values.alerting.auth_type.certificate.url }} + tls_config: + ca_file: /etc/prometheus/operator/ca.crt + cert_file: /etc/prometheus/operator/tls.crt + key_file: /etc/prometheus/operator/tls.key + insecure_skip_verify: {{ .Values.alerting.auth_type.certificate.insecure_skip_verify }} +{{- end }} - kubernetes_sd_configs: - role: endpoints namespaces: diff --git a/charts/seed-monitoring/charts/core/charts/prometheus/templates/prometheus.yaml b/charts/seed-monitoring/charts/core/charts/prometheus/templates/prometheus.yaml index 2d74ebfdcdf..a948bdbf428 100644 --- a/charts/seed-monitoring/charts/core/charts/prometheus/templates/prometheus.yaml +++ 
b/charts/seed-monitoring/charts/core/charts/prometheus/templates/prometheus.yaml @@ -137,6 +137,10 @@ spec: # we mount the Shoot cluster's CA and certs - mountPath: /etc/prometheus/seed name: prometheus-kubeconfig +{{- if hasKey .Values.alerting.auth_type "certificate" }} + - mountPath: /etc/prometheus/operator + name: prometheus-remote-am-tls +{{- end }} - image: {{ index .Values.images "vpn-seed" }} imagePullPolicy: IfNotPresent name: vpn-seed @@ -259,6 +263,11 @@ spec: - name: blackbox-exporter-config-prometheus configMap: name: blackbox-exporter-config-prometheus +{{- if hasKey .Values.alerting.auth_type "certificate" }} + - name: prometheus-remote-am-tls + secret: + secretName: prometheus-remote-am-tls +{{- end }} volumeClaimTemplates: - metadata: name: prometheus-db diff --git a/charts/seed-monitoring/charts/core/charts/prometheus/values.yaml b/charts/seed-monitoring/charts/core/charts/prometheus/values.yaml index 148c360340a..d56f1c2e303 100644 --- a/charts/seed-monitoring/charts/core/charts/prometheus/values.yaml +++ b/charts/seed-monitoring/charts/core/charts/prometheus/values.yaml @@ -324,6 +324,21 @@ rules: enabled: false rules: false +alerting: + auth_type: {} +# none: +# url: foo.bar +# basic: +# url: foo.bar +# username: admin +# password: password +# certificate: +# url: foo.bar +# ca.crt: ca +# tls.crt: certificate +# tls.key: key +# insecure_skip_verify: false + ignoreAlerts: false # object can be any object you want to scale Prometheus on: diff --git a/docs/README.md b/docs/README.md index 1bbc3860688..24bd14f4ba4 100644 --- a/docs/README.md +++ b/docs/README.md @@ -74,3 +74,7 @@ * [Deploying the Gardener into a Kubernetes cluster](deployment/kubernetes.md) * [Deploying the Gardener and a Seed into an AKS cluster](deployment/aks.md) * [Overwrite image vector](deployment/image_vector.md) + +## Monitoring + +* [Alerting](monitoring/alerting.md) diff --git a/docs/monitoring/alerting.md b/docs/monitoring/alerting.md new file mode 100644 index 
00000000000..49ee8ac9f4e --- /dev/null +++ b/docs/monitoring/alerting.md @@ -0,0 +1,137 @@ +# Alerting + +Gardener uses [Prometheus](https://prometheus.io/) to gather metrics from each component. A Prometheus is deployed in each shoot control plane (on the seed) which is responsible for gathering control plane and cluster metrics. Prometheus can be configured to fire alerts based on these metrics and send them to an [alertmanager](https://prometheus.io/docs/alerting/alertmanager/). The alertmanager is responsible for sending the alerts to users and operators. This document describes how to set up alerting for: + +- [end-users/stakeholders/customers](#Alerting-for-Users) +- [operators/administrators](#Alerting-for-Operators) + +# Alerting for Users + +To receive email alerts as a user, set the following values in the shoot spec: + +```yaml +spec: + monitoring: + alerting: + emailReceivers: + - john.doe@example.com +``` +`emailReceivers` is a list of emails that will receive alerts if something is wrong with the shoot cluster. A list of alerts for users can be found [here](user_alerts.md). + +# Alerting for Operators + +Currently, Gardener supports two options for alerting: + +- [Email Alerting](#Email-Alerting) +- [Sending Alerts to an external alertmanager](#External-Alertmanager) + +A list of operator alerts can be found [here](operator_alerts.md). + +## Email Alerting + +Gardener provides the option to deploy an alertmanager into each seed. This alertmanager is responsible for sending out alerts to operators for each shoot cluster in the seed. Only email alerts are supported by the alertmanager managed by Gardener. This is configurable by setting the Gardener controller manager configuration values `alerting`. See [this document](../usage/configuration.md) for how to configure Gardener's SMTP secret. If the values are set, a secret with the label `gardener.cloud/role: alerting` will be created in the garden namespace of the garden cluster. 
This secret will be used by each alertmanager in each seed. + +## External Alertmanager + +The alertmanager supports different kinds of [alerting configurations](https://prometheus.io/docs/alerting/configuration/). The alertmanager provided by Gardener only supports email alerts. If email is not sufficient, then alerts can be sent to an external alertmanager. Prometheus will send alerts to a URL and then alerts will be handled by the external alertmanager. This external alertmanager is operated and configured by the operator (i.e. Gardener does not configure or deploy this alertmanager). To configure sending alerts to an external alertmanager, create a secret in the virtual garden cluster in the garden namespace with the label: `gardener.cloud/role: alerting`. This secret needs to contain a URL to the external alertmanager and information regarding authentication. Supported authentication types are: + +- No Authentication (none) +- Basic Authentication (basic) +- Mutual TLS (certificate) + +### Remote Alertmanager Examples + +Note: the `url` value cannot be prepended with `http` or `https`. 
+ +```yaml +# No Authentication +apiVersion: v1 +kind: Secret +metadata: + labels: + gardener.cloud/role: alerting + name: alerting-auth + namespace: garden +data: + # No Authentication + auth_type: base64(none) + url: base64(external.alertmanager.foo) + + # Basic Auth + auth_type: base64(basic) + url: base64(external.alertmanager.foo) + username: base64(admin) + password: base64(password) + + # Mutual TLS + auth_type: base64(certificate) + url: base64(external.alertmanager.foo) + ca.crt: base64(ca) + tls.crt: base64(certificate) + tls.key: base64(key) + + # Email Alerts (internal alertmanager) + auth_type: base64(smtp) + auth_identity: base64(internal.alertmanager.auth_identity) + auth_password: base64(internal.alertmanager.auth_password) + auth_username: base64(internal.alertmanager.auth_username) + from: base64(internal.alertmanager.from) + smarthost: base64(internal.alertmanager.smarthost) + to: base64(internal.alertmanager.to) +type: Opaque +``` + +### Configuring your External Alertmanager + +Please refer to the [alertmanager](https://prometheus.io/docs/alerting/alertmanager/) documentation on how to configure an alertmanager. + +We recommend you use at least the following inhibition rules in your alertmanager configuration to prevent excessive alerts: +```yaml +inhibit_rules: +# Apply inhibition if the alert name is the same. +- source_match: + severity: critical + target_match: + severity: warning + equal: ['alertname', 'service', 'cluster'] + +# Stop all alerts for type=shoot if there are VPN problems. +- source_match: + service: vpn + target_match_re: + type: shoot + equal: ['type', 'cluster'] + +# Stop warning and critical alerts if there is a blocker - no worker nodes, no etcd main etc. +- source_match: + severity: blocker + target_match_re: + severity: ^(critical|warning)$ + equal: ['cluster'] + +# If the API server is down inhibit no worker nodes alert. No worker nodes depends on kube-state-metrics which depends on the API server. 
+- source_match: + service: kube-apiserver + target_match_re: + service: nodes + equal: ['cluster'] + +# If API server is down inhibit kube-state-metrics alerts. +- source_match: + service: kube-apiserver + target_match_re: + severity: info + equal: ['cluster'] + +# No Worker nodes depends on kube-state-metrics. Inhibit no worker nodes if kube-state-metrics is down. +- source_match: + service: kube-state-metrics-shoot + target_match_re: + service: nodes + equal: ['cluster'] +``` +Below is a graph visualizing the inhibition rules: + +![inhibitionGraph](../development/content/alertInhibitionGraph.png) + + diff --git a/docs/development/operator_alerts.md b/docs/monitoring/operator_alerts.md similarity index 100% rename from docs/development/operator_alerts.md rename to docs/monitoring/operator_alerts.md diff --git a/docs/development/user_alerts.md b/docs/monitoring/user_alerts.md similarity index 100% rename from docs/development/user_alerts.md rename to docs/monitoring/user_alerts.md diff --git a/docs/usage/configuration.md b/docs/usage/configuration.md index 8c854fc98d6..6c4dd2abfdc 100644 --- a/docs/usage/configuration.md +++ b/docs/usage/configuration.md @@ -48,10 +48,15 @@ When the `gardener-controller-manager` starts it scans the `garden` namespace of * Not every end-user/stakeholder/customer has its own domain, however, Gardener needs to create a DNS record for every shoot cluster. * As landscape operator you might want to define a default domain owned and controlled by you that is used for all shoot clusters that don't specify their own domain. -* **Alerting SMTP secrets** (optional), contain the SMTP credentials which will be used by the [AlertmMnager](https://prometheus.io/docs/alerting/alertmanager/) to send emails for alerts, please see [this](../../example/10-secret-alerting-smtp.yaml) for an example. - * These secrets are used by the AlertManager which is deployed next to the Kubernetes control plane of a shoot cluster in seed clusters. 
- * In case there have been alerting SMTP secrets configured, the Gardener will inject the credentials in the configuration of the AlertManager. - * It will use them to send mails to the stated email address in case anything is wrong with the Shoot clusters. +* **Alerting secrets** (optional), contain the alerting configuration and credentials for the [Alertmanager](https://prometheus.io/docs/alerting/alertmanager/) to send email alerts. It is also possible to configure the monitoring stack to send alerts to an alertmanager not deployed by Gardener to handle alerting. Please see [this](../../example/10-secret-alerting.yaml) for an example. + * If email alerting is configured: + * An Alertmanager is deployed into each seed cluster that handles the alerting for all shoots on the seed cluster. + * Gardener will inject the SMTP credentials into the configuration of the Alertmanager. + * The Alertmanager will send emails to the configured email address in case any alerts are firing. + * If an external alertmanager is configured: + * Each shoot has a [Prometheus](https://prometheus.io/docs/introduction/overview/) responsible for monitoring components and sending out alerts. The alerts will be sent to a URL configured in the alerting secret. + * This external alertmanager is not managed by Gardener and can be configured however the operator sees fit. + * Supported authentication types are no authentication, basic, or mutual TLS. * **OpenVPN Diffie-Hellmann Key secret** (optional), contains the self-generated Diffie-Hellmann key used by OpenVPN in your landscape, please see [this](../../example/10-secret-openvpn-diffie-hellman.yaml) for an example. * If you don't specify a custom key then a default key is used, but for productive landscapes it's recommend to create a landscape-specific key and define it. 
diff --git a/example/10-secret-alerting-smtp.yaml b/example/10-secret-alerting-smtp.yaml deleted file mode 100644 index 3926aa5ea80..00000000000 --- a/example/10-secret-alerting-smtp.yaml +++ /dev/null @@ -1,17 +0,0 @@ -# Secret containing SMTP configuration used by the AlertManager to send email alerts for Shoot clusters. ---- -apiVersion: v1 -kind: Secret -metadata: - name: alerting-smtp - namespace: garden - labels: - garden.sapcloud.io/role: alerting-smtp -type: Opaque -data: - to: base64(email-address-to-send-notifcations-to) - from: base64(email-address-to-send-notifcations-from) - smarthost: base64(smtp-host-used-for-sending) - auth_username: base64(smtp-authentication-username) - auth_identity: base64(smtp-authentication-username) - auth_password: base64(smtp-authentication-password) diff --git a/example/10-secret-alerting.yaml b/example/10-secret-alerting.yaml new file mode 100644 index 00000000000..37de47b7ca4 --- /dev/null +++ b/example/10-secret-alerting.yaml @@ -0,0 +1,36 @@ +# Secret containing SMTP configuration used by the AlertManager to send email alerts for Shoot clusters. 
+--- +apiVersion: v1 +kind: Secret +metadata: + name: alerting + namespace: garden + labels: + gardener.cloud/role: alerting +type: Opaque +data: + # No Authentication + auth_type: base64(none) + url: base64(external.alertmanager.foo) + + # Basic Auth + auth_type: base64(basic) + url: base64(extenal.alertmanager.foo) + username: base64(admin) + password: base64(password) + + # Mutual TLS + auth_type: base64(certificate) + url: base64(external.alertmanager.foo) + ca.crt: base64(ca) + tls.crt: base64(certificate) + tls.key: base64(key) + + # Email Alerts (internal alertmanager) + auth_type: base64(smtp) + auth_identity: base64(internal.alertmanager.auth_identity) + auth_password: base64(internal.alertmanager.auth_password) + auth_username: base64(internal.alertmanager.auth_username) + from: base64(internal.alertmanager.from) + smarthost: base64(internal.alertmanager.smarthost) + to: base64(internal.alertmanager.to) diff --git a/pkg/apis/core/v1alpha1/constants/types_constants.go b/pkg/apis/core/v1alpha1/constants/types_constants.go index 07e8f20fc91..80860e334b7 100644 --- a/pkg/apis/core/v1alpha1/constants/types_constants.go +++ b/pkg/apis/core/v1alpha1/constants/types_constants.go @@ -117,8 +117,6 @@ const ( GardenRole = "gardener.cloud/role" // GardenRoleExtension is a constant for a label that describes the 'extensions' role. GardenRoleExtension = "extension" - // GardenRoleAlertingSMTP is the value of the GardenRole key indicating type 'alerting-smtp'. - GardenRoleAlertingSMTP = "alerting-smtp" // GardenRoleSeed is the value of the GardenRole key indicating type 'seed'. GardenRoleSeed = "seed" // GardenRoleShoot is the value of the GardenRole key indicating type 'shoot'. 
diff --git a/pkg/operation/botanist/monitoring.go b/pkg/operation/botanist/monitoring.go index d4eac74626b..6f58a35c490 100644 --- a/pkg/operation/botanist/monitoring.go +++ b/pkg/operation/botanist/monitoring.go @@ -26,6 +26,7 @@ import ( "github.com/gardener/gardener/pkg/features" "github.com/gardener/gardener/pkg/operation/common" "github.com/gardener/gardener/pkg/utils" + kutil "github.com/gardener/gardener/pkg/utils/kubernetes" "github.com/gardener/gardener/pkg/utils/secrets" appsv1 "k8s.io/api/apps/v1" @@ -65,6 +66,11 @@ func (b *Botanist) DeploySeedMonitoring(ctx context.Context) error { usersDashboards.WriteString(fmt.Sprintln(cm.Data[v1alpha1constants.GrafanaConfigMapUserDashboard])) } + alerting, err := b.getCustomAlertingConfigs(ctx, b.GetSecretKeysOfRole(common.GardenRoleAlerting)) + if err != nil { + return err + } + var ( prometheusConfig = map[string]interface{}{ "kubernetesVersion": b.Shoot.Info.Spec.Kubernetes.Version, @@ -116,6 +122,7 @@ func (b *Botanist) DeploySeedMonitoring(ctx context.Context) error { "project": b.Garden.Project.Name, }, "ignoreAlerts": b.Shoot.IgnoreAlerts, + "alerting": alerting, "extensions": map[string]interface{}{ "rules": alertingRules.String(), "scrapeConfigs": scrapeConfigs.String(), @@ -174,13 +181,18 @@ func (b *Botanist) DeploySeedMonitoring(ctx context.Context) error { // Check if we want to deploy an alertmanager into the shoot namespace. 
if b.Shoot.WantsAlertmanager { var ( - alertingSMTPKeys = b.GetSecretKeysOfRole(common.GardenRoleAlertingSMTP) + alertingSMTPKeys = b.GetSecretKeysOfRole(common.GardenRoleAlerting) emailConfigs = []map[string]interface{}{} ) + if b.Shoot.Info.Spec.Monitoring != nil && b.Shoot.Info.Spec.Monitoring.Alerting != nil { for _, email := range b.Shoot.Info.Spec.Monitoring.Alerting.EmailReceivers { for _, key := range alertingSMTPKeys { secret := b.Secrets[key] + + if string(secret.Data["auth_type"]) != "smtp" { + continue + } emailConfigs = append(emailConfigs, map[string]interface{}{ "to": email, "from": string(secret.Data["from"]), @@ -216,6 +228,83 @@ func (b *Botanist) DeploySeedMonitoring(ctx context.Context) error { return nil } +func (b *Botanist) getCustomAlertingConfigs(ctx context.Context, alertingSecretKeys []string) (map[string]interface{}, error) { + configs := map[string]interface{}{ + "auth_type": map[string]interface{}{}, + } + + for _, key := range alertingSecretKeys { + secret := b.Secrets[key] + + if string(secret.Data["auth_type"]) == "none" { + + if url, ok := secret.Data["url"]; ok { + configs["auth_type"] = map[string]interface{}{ + "none": map[string]interface{}{ + "url": string(url), + }, + } + } + break + } + + if string(secret.Data["auth_type"]) == "basic" { + url, urlOk := secret.Data["url"] + username, usernameOk := secret.Data["username"] + password, passwordOk := secret.Data["password"] + + if urlOk && usernameOk && passwordOk { + configs["auth_type"] = map[string]interface{}{ + "basic": map[string]interface{}{ + "url": string(url), + "username": string(username), + "password": string(password), + }, + } + } + break + } + + if string(secret.Data["auth_type"]) == "certificate" { + data := map[string][]byte{} + url, urlOk := secret.Data["url"] + ca, caOk := secret.Data["ca.crt"] + cert, certOk := secret.Data["tls.crt"] + key, keyOk := secret.Data["tls.key"] + insecure, insecureOk := secret.Data["insecure_skip_verify"] + + if urlOk && caOk && 
certOk && keyOk && insecureOk { + configs["auth_type"] = map[string]interface{}{ + "certificate": map[string]interface{}{ + "url": string(url), + "insecure_skip_verify": string(insecure), + }, + } + data["ca.crt"] = ca + data["tls.crt"] = cert + data["tls.key"] = key + amSecret := &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Name: "prometheus-remote-am-tls", + Namespace: b.Shoot.SeedNamespace, + }, + } + + if err := kutil.CreateOrUpdate(ctx, b.K8sSeedClient.Client(), amSecret, func() error { + amSecret.Data = data + amSecret.Type = corev1.SecretTypeOpaque + return nil + }); err != nil { + return nil, err + } + } + break + } + } + + return configs, nil +} + func (b *Botanist) deployGrafanaCharts(role, dashboards, basicAuth, subDomain string) error { values, err := b.InjectSeedShootImages(map[string]interface{}{ "ingress": map[string]interface{}{ diff --git a/pkg/operation/common/types.go b/pkg/operation/common/types.go index 090e9cc302f..919d86d87ba 100644 --- a/pkg/operation/common/types.go +++ b/pkg/operation/common/types.go @@ -146,6 +146,9 @@ const ( // GardenRoleGlobalMonitoring is the value of the GardenRole key indicating type 'global-monitoring' GardenRoleGlobalMonitoring = "global-monitoring" + // GardenRoleAlerting is the value of GardenRole key indicating type 'alerting'. + GardenRoleAlerting = "alerting" + // GardenRoleHvpa is the value of GardenRole key indicating type 'hvpa'. 
GardenRoleHvpa = "hvpa" diff --git a/pkg/operation/garden/garden.go b/pkg/operation/garden/garden.go index a530a105be4..7179ca95f8b 100644 --- a/pkg/operation/garden/garden.go +++ b/pkg/operation/garden/garden.go @@ -120,6 +120,7 @@ func ReadGardenSecrets(k8sInformers kubeinformers.SharedInformerFactory) (map[st secretsMap = make(map[string]*corev1.Secret) numberOfInternalDomainSecrets = 0 numberOfOpenVPNDiffieHellmanSecrets = 0 + numberOfAlertingSecrets = 0 ) selector, err := labels.Parse(v1alpha1constants.DeprecatedGardenRole) @@ -159,15 +160,6 @@ func ReadGardenSecrets(k8sInformers kubeinformers.SharedInformerFactory) (map[st numberOfInternalDomainSecrets++ } - // Retrieving alerting SMTP secrets based on all secrets in the Garden namespace which have - // a label indicating the Garden role alerting-smtp. - // Only when using the in-cluster config as we do not want to configure alerts in development modus. - if secret.Labels[v1alpha1constants.DeprecatedGardenRole] == common.GardenRoleAlertingSMTP { - alertingSMTP := secret - secretsMap[fmt.Sprintf("%s-%s", common.GardenRoleAlertingSMTP, secret.Name)] = alertingSMTP - logger.Logger.Infof("Found alerting SMTP secret %s.", secret.Name) - } - // Retrieving Diffie-Hellman secret for OpenVPN based on all secrets in the Garden namespace which have // a label indicating the Garden role openvpn-diffie-hellman. 
if secret.Labels[v1alpha1constants.DeprecatedGardenRole] == common.GardenRoleOpenVPNDiffieHellman { @@ -187,8 +179,32 @@ func ReadGardenSecrets(k8sInformers kubeinformers.SharedInformerFactory) (map[st monitoringSecret := secret secretsMap[common.GardenRoleGlobalMonitoring] = monitoringSecret logger.Logger.Infof("Found monitoring basic auth secret %s.", secret.Name) - } else { - logger.Logger.Info("No monitoring basic auth secret found.") + } + } + + selectorGardenRole, err := labels.Parse(v1alpha1constants.GardenRole) + if err != nil { + return nil, err + } + + secretsGardenRole, err := k8sInformers.Core().V1().Secrets().Lister().Secrets(v1alpha1constants.GardenNamespace).List(selectorGardenRole) + if err != nil { + return nil, err + } + + for _, secret := range secretsGardenRole { + + // Retrieve the alerting secret to configure alerting. Either in cluster email alerting or + // external alertmanager configuration. + if secret.Labels[v1alpha1constants.GardenRole] == common.GardenRoleAlerting { + authType := string(secret.Data["auth_type"]) + if authType != "smtp" && authType != "none" && authType != "basic" && authType != "certificate" { + return nil, fmt.Errorf("Invalid or missing field 'auth_type' in secret %s", secret.Name) + } + alertingSecret := secret + secretsMap[common.GardenRoleAlerting] = alertingSecret + logger.Logger.Infof("Found alerting secret %s.", secret.Name) + numberOfAlertingSecrets++ } } @@ -213,6 +229,12 @@ func ReadGardenSecrets(k8sInformers kubeinformers.SharedInformerFactory) (map[st return nil, fmt.Errorf("can only accept at most one OpenVPN Diffie Hellman secret, but found %d", numberOfOpenVPNDiffieHellmanSecrets) } + // Operators can configure gardener to send email alerts or send the alerts to an external alertmanager. If no configuration + // is provided then no alerts will be sent. 
+ if numberOfAlertingSecrets > 1 { + return nil, fmt.Errorf("can only accept at most one alerting secret, but found %d", numberOfAlertingSecrets) + } + return secretsMap, nil } diff --git a/pkg/operation/operation.go b/pkg/operation/operation.go index 7f2e8e75e30..b23817ce1af 100644 --- a/pkg/operation/operation.go +++ b/pkg/operation/operation.go @@ -133,12 +133,8 @@ func newOperation( } func shootWantsAlertmanager(shoot *gardencorev1alpha1.Shoot, secrets map[string]*corev1.Secret) bool { - if alertingSMTPSecret := common.GetSecretKeysWithPrefix(v1alpha1constants.GardenRoleAlertingSMTP, secrets); len(alertingSMTPSecret) > 0 { - if shoot.Spec.Monitoring != nil && - shoot.Spec.Monitoring.Alerting != nil && - len(shoot.Spec.Monitoring.Alerting.EmailReceivers) > 0 { - return true - } + if shoot.Spec.Monitoring != nil && shoot.Spec.Monitoring.Alerting != nil && len(shoot.Spec.Monitoring.Alerting.EmailReceivers) > 0 { + return true } return false } diff --git a/pkg/operation/seed/seed.go b/pkg/operation/seed/seed.go index ef39a6ab4f1..95fa0cde823 100644 --- a/pkg/operation/seed/seed.go +++ b/pkg/operation/seed/seed.go @@ -398,20 +398,31 @@ func BootstrapCluster(seed *Seed, config *config.ControllerManagerConfiguration, "storage": seed.GetValidVolumeSize("1Gi"), } - if alertingSMTPKeys := common.GetSecretKeysWithPrefix(common.GardenRoleAlertingSMTP, secrets); len(alertingSMTPKeys) > 0 { + alertingSMTPKeys := common.GetSecretKeysWithPrefix(common.GardenRoleAlerting, secrets) + + if seedWantsAlertmanager(alertingSMTPKeys, secrets) { emailConfigs := make([]map[string]interface{}, 0, len(alertingSMTPKeys)) for _, key := range alertingSMTPKeys { - secret := secrets[key] - emailConfigs = append(emailConfigs, map[string]interface{}{ - "to": string(secret.Data["to"]), - "from": string(secret.Data["from"]), - "smarthost": string(secret.Data["smarthost"]), - "auth_username": string(secret.Data["auth_username"]), - "auth_identity": string(secret.Data["auth_identity"]), - 
"auth_password": string(secret.Data["auth_password"]), - }) + if string(secrets[key].Data["auth_type"]) == "smtp" { + secret := secrets[key] + emailConfigs = append(emailConfigs, map[string]interface{}{ + "to": string(secret.Data["to"]), + "from": string(secret.Data["from"]), + "smarthost": string(secret.Data["smarthost"]), + "auth_username": string(secret.Data["auth_username"]), + "auth_identity": string(secret.Data["auth_identity"]), + "auth_password": string(secret.Data["auth_password"]), + }) + alertManagerConfig["enabled"] = true + alertManagerConfig["emailConfigs"] = emailConfigs + break + } + } + } else { + alertManagerConfig["enabled"] = false + if err := common.DeleteAlertmanager(context.TODO(), k8sSeedClient.Client(), v1alpha1constants.GardenNamespace); err != nil { + return err } - alertManagerConfig["emailConfigs"] = emailConfigs } nodes := &corev1.NodeList{} @@ -640,3 +651,12 @@ func (s *Seed) GetValidVolumeSize(size string) string { return size } + +func seedWantsAlertmanager(keys []string, secrets map[string]*corev1.Secret) bool { + for _, key := range keys { + if string(secrets[key].Data["auth_type"]) == "smtp" { + return true + } + } + return false +}