[prometheus-alerts] Import the latest version of the chart from k8s-p…

…ublic-charts (#6)
Nextdoor · Apr 15, 2021 · ab32295 · ab32295
1 parent 2f1c256
commit ab32295
Show file tree

Hide file tree

Showing 8 changed files with 197 additions and 6 deletions.
diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
@@ -6,15 +6,15 @@ on:
       - 'charts/**'
 
 jobs:
-  lint-docs:
+  helm-docs:
     runs-on: ubuntu-latest
     steps:
       - name: Checkout
         uses: actions/checkout@v1
       - name: Run helm-docs
         run: .github/helm-docs.sh
 
-  lint-test:
+  lint-and-test:
     runs-on: ubuntu-latest
     steps:
       - name: Checkout

diff --git a/.gitignore b/.gitignore
@@ -4,3 +4,7 @@
 # Helm chart automated files
 /charts/*/charts
 .idea
+
+# Files for diffing between templates
+new
+orig
diff --git a/charts/prometheus-alerts/Chart.yaml b/charts/prometheus-alerts/Chart.yaml
@@ -2,7 +2,7 @@ apiVersion: v2
 name: prometheus-alerts
 description: Helm Chart that provisions a series of common Prometheus Alerts
 type: application
-version: 0.1.0
+version: 0.1.3
 appVersion: 0.0.1
 maintainers:
   - name: diranged

diff --git a/charts/prometheus-alerts/README.md b/charts/prometheus-alerts/README.md
@@ -1,6 +1,6 @@
 # prometheus-alerts
 
-![Version: 0.1.0](https://img.shields.io/badge/Version-0.1.0-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: 0.0.1](https://img.shields.io/badge/AppVersion-0.0.1-informational?style=flat-square)
+![Version: 0.1.3](https://img.shields.io/badge/Version-0.1.3-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: 0.0.1](https://img.shields.io/badge/AppVersion-0.0.1-informational?style=flat-square)
 
 Helm Chart that provisions a series of common Prometheus Alerts
 
@@ -22,6 +22,9 @@ Helm Chart that provisions a series of common Prometheus Alerts
 | alertManager.repeatInterval | string | `"1h"` | How long to wait before sending a notification again if it has already been sent successfully for an alert. (Usually ~3h or more). |
 | chart_name | string | `"prometheus-rules"` |  |
 | chart_source | string | `"https://github.com/Nextdoor/k8s-charts"` |  |
+| containerRules.CPUThrottlingHigh.for | string | `"15m"` |  |
+| containerRules.CPUThrottlingHigh.severity | string | `"warning"` |  |
+| containerRules.CPUThrottlingHigh.threshold | int | `65` |  |
 | containerRules.KubeContainerWaiting.for | string | `"1h"` |  |
 | containerRules.KubeContainerWaiting.severity | string | `"warning"` |  |
 | containerRules.KubeDaemonSetMisScheduled.for | string | `"15m"` |  |
@@ -60,7 +63,14 @@ Helm Chart that provisions a series of common Prometheus Alerts
 | containerRules.PodContainerTerminated.threshold | int | `0` |  |
 | containerRules.enabled | bool | `true` | Whether or not to enable the container rules template |
 | defaults.additionalRuleLabels | object | `{}` | Additional custom labels attached to every PrometheusRule |
-| defaults.runbookUrl | string | `"https://github.com/Nextdoor/k8s-charts/tree/main/charts/prometheus-rules/runbook.md"` | The prefix URL to the runbook_urls that will be applied to each PrometheusRule |
+| defaults.runbookUrl | string | `"https://github.com/Nextdoor/k8s-charts/blob/main/charts/prometheus-alerts/runbook.md"` | The prefix URL to the runbook_urls that will be applied to each PrometheusRule |
+| namespaceRules.KubeQuotaAlmostFull.for | string | `"10m"` |  |
+| namespaceRules.KubeQuotaAlmostFull.severity | string | `"warning"` |  |
+| namespaceRules.KubeQuotaAlmostFull.threshold | int | `90` |  |
+| namespaceRules.KubeQuotaFullyUsed.for | string | `"10m"` |  |
+| namespaceRules.KubeQuotaFullyUsed.severity | string | `"critical"` |  |
+| namespaceRules.KubeQuotaFullyUsed.threshold | int | `99` |  |
+| namespaceRules.enabled | bool | `true` | Whether or not to enable the namespace rules template |
 
 ----------------------------------------------
 Autogenerated from chart metadata using [helm-docs v1.4.0](https://github.com/norwoodj/helm-docs/releases/v1.4.0)
diff --git a/charts/prometheus-alerts/runbook.md b/charts/prometheus-alerts/runbook.md
@@ -0,0 +1,67 @@
+## CPUThrottlingHigh
+
+This alert fires if any particular container is experiencing throttling by the
+Linux CFS system. This typically means that your container is operating close
+to its Kubernetes `resource.limits` configuration. You can quickly look at the
+utilization of the individual containers within a given pod or namespace like
+this:
+
+    $ k top pods --containers
+    POD                                      NAME               CPU(cores)   MEMORY(bytes)
+    datadog-agent-2qk9w                      agent              22m          65Mi
+    datadog-agent-2qk9w                      process-agent      10m          35Mi
+    datadog-agent-2qk9w                      system-probe       6m           34Mi
+    datadog-agent-2qk9w                      trace-agent        2m           27Mi
+
+You can compare the actual CPU and memory usage against the requests and
+limits configured on the pod with the `kubectl describe pod <pod>` command:
+
+    $ k describe pod datadog-agent-2qk9w
+    Name:                 datadog-agent-2qk9w
+    Namespace:            datadog-operator
+    ...
+    Containers:
+      agent:
+      ...
+        Limits:
+          cpu:     25m
+          memory:  256Mi
+        Requests:
+          cpu:      10m
+          memory:   96Mi
+
+In the example above, you can see that the `agent` has a CPU Limit of `25m`,
+but it's running at `22m`... so it's pretty close to its actual limits. Its
+resource limits should likely be adjusted.
+
+## KubeQuotaAlmostFull
+
+This alert is telling you that the resources requested by all of the `Pods` in
+your `Namespace` are close to the `Quota` limits that have been assigned. You
+can inspect any quotas or limits placed on your `Namespace` like this:
+
+    $ kubectl describe namespace my-namespace
+    Name:         my-namespace
+    Status:       Active
+
+    Resource Quotas
+     Name:             default-quotas
+     Resource          Used     Hard
+     --------          ---      ---
+     limits.cpu        10500m   64
+     limits.memory     18816Mi  128Gi
+     requests.cpu      8500m    64
+     requests.memory   16256Mi  128Gi
+     requests.storage  105Gi    512Gi
+
+    Resource Limits
+     Type       Resource  Min  Max   Default Request  Default Limit  Max Limit/Request Ratio
+     ----       --------  ---  ---   ---------------  -------------  -----------------------
+     Container  cpu       -    8     0                0              -
+     Container  memory    -    16Gi  128Mi            128Mi          -
+
+## KubeQuotaFullyUsed
+
+Similar to the `KubeQuotaAlmostFull` alert - but you are now out of resources.
+At this point you cannot launch or scale any new resources until you reduce
+your usage, or work with an administrator to expand your `Quota` capacity.
diff --git a/...ometheus-alerts/templates/containers.yaml → .../templates/containers-prometheusrule.yaml b/...ometheus-alerts/templates/containers.yaml → .../templates/containers-prometheusrule.yaml
@@ -29,6 +29,28 @@ spec:
         {{- end }}
     {{ end -}}
 
+    {{- with .Values.containerRules.CPUThrottlingHigh }}
+    # CPUThrottlingHigh: fires when the fraction of CFS periods in which a
+    # container was throttled (measured over 5m) stays above `threshold`
+    # percent for the configured `for` duration.
+    #
+    # Both sides of the ratio use the same selector. The division only pairs
+    # identical (container, pod, namespace) groups, so scoping the denominator
+    # does not change the result; it only avoids computing increase() over
+    # series that can never match the numerator.
+    - alert: CPUThrottlingHigh
+      annotations:
+        summary: Processes experience elevated CPU throttling.
+        runbook_url: {{ $values.defaults.runbookUrl }}#CPUThrottlingHigh
+        description: >-
+          {{`{{ $value | humanizePercentage }} throttling of CPU in
+          namespace {{ $labels.namespace }} for container {{ $labels.container
+          }} in pod {{ $labels.pod }}.`}}
+      expr: |-
+        sum(increase(container_cpu_cfs_throttled_periods_total{container!="", namespace=~"{{ $targetNamespace }}"}[5m])) by (container, pod, namespace)
+          /
+        sum(increase(container_cpu_cfs_periods_total{container!="", namespace=~"{{ $targetNamespace }}"}[5m])) by (container, pod, namespace)
+          > ( {{ .threshold }} / 100 )
+      for: {{ .for }}
+      labels:
+        severity: {{ .severity }}
+        {{- if $values.defaults.additionalRuleLabels }}
+        {{ toYaml $values.defaults.additionalRuleLabels | nindent 8 }}
+        {{- end }}
+    {{- end }}
+
   #
   # Original Source:
   #    https://raw.githubusercontent.com/prometheus-community/helm-charts/kube-prometheus-stack-13.3.0/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-apps.yaml

diff --git a/charts/prometheus-alerts/templates/namespace-prometheusrule.yaml b/charts/prometheus-alerts/templates/namespace-prometheusrule.yaml
@@ -0,0 +1,61 @@
+{{ $values           := .Values }}
+{{ $targetNamespace  := .Release.Namespace }}
+{{ if .Values.namespaceRules.enabled }}
+# Largely copied from
+# https://github.com/prometheus-community/helm-charts/blob/kube-prometheus-stack-14.6.2/charts/kube-prometheus-stack/templates/prometheus/rules-1.14/kubernetes-resources.yaml,
+# but more customizable.
+#
+# Renders a PrometheusRule with ResourceQuota usage alerts scoped to the
+# release namespace. Each rule reads its `for`, `severity` and `threshold`
+# settings from the matching key under `namespaceRules` in the chart values.
+#
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: {{ .Release.Name }}-namespace-rules
+  annotations:
+    nextdoor.com/chart: {{ .Values.chart_name }}
+    nextdoor.com/source: {{ .Values.chart_source }}
+spec:
+  groups:
+  - name: {{ .Release.Name }}.{{ .Release.Namespace }}.namespaceRules
+    rules:
+
+    {{- with .Values.namespaceRules.KubeQuotaAlmostFull }}
+    # KubeQuotaAlmostFull: used/hard quota ratio, scaled by 100, is above
+    # `threshold` but still below 100. Because of the scaling, the alert value
+    # is already a percentage, so the description prints it with a "%" sign.
+    - alert: KubeQuotaAlmostFull
+      annotations:
+        summary: Namespace quota is going to be full.
+        runbook_url: {{ $values.defaults.runbookUrl }}#KubeQuotaAlmostFull
+        description: >-
+          {{`Namespace {{ $labels.namespace }} is using {{ printf "%.2f" $value }}% of its {{ $labels.resource }} quota.`}}
+      expr: |-
+        (
+          kube_resourcequota{job="kube-state-metrics", type="used", namespace=~"{{ $targetNamespace }}"}
+            / ignoring(instance, job, type)
+          (kube_resourcequota{job="kube-state-metrics", type="hard", namespace=~"{{ $targetNamespace }}"} > 0)
+        ) * 100 > {{ .threshold }} < 100
+      for: {{ .for }}
+      labels:
+        severity: {{ .severity }}
+        {{- if $values.defaults.additionalRuleLabels }}
+        {{ toYaml $values.defaults.additionalRuleLabels | nindent 8 }}
+        {{- end }}
+    {{- end }}
+
+    {{- with .Values.namespaceRules.KubeQuotaFullyUsed }}
+    # KubeQuotaFullyUsed: same ratio as KubeQuotaAlmostFull, but fires once the
+    # percentage reaches `threshold` or more, with no upper bound.
+    - alert: KubeQuotaFullyUsed
+      annotations:
+        summary: Namespace quota is fully used.
+        runbook_url: {{ $values.defaults.runbookUrl }}#KubeQuotaFullyUsed
+        description: >-
+          {{`Namespace {{ $labels.namespace }} is using {{ printf "%.2f" $value }}% of its {{ $labels.resource }} quota.`}}
+      expr: |-
+        (
+          kube_resourcequota{job="kube-state-metrics", type="used", namespace=~"{{ $targetNamespace }}"}
+            / ignoring(instance, job, type)
+          (kube_resourcequota{job="kube-state-metrics", type="hard", namespace=~"{{ $targetNamespace }}"} > 0)
+        ) * 100 >= {{ .threshold }}
+      for: {{ .for }}
+      labels:
+        severity: {{ .severity }}
+        {{- if $values.defaults.additionalRuleLabels }}
+        {{ toYaml $values.defaults.additionalRuleLabels | nindent 8 }}
+        {{- end }}
+    {{- end }}
+{{- end }}
diff --git a/charts/prometheus-alerts/values.yaml b/charts/prometheus-alerts/values.yaml
@@ -66,7 +66,7 @@ alertManager:
 # Defaults applied to all Prometheus Rules
 defaults:
   # -- The prefix URL to the runbook_urls that will be applied to each PrometheusRule
-  runbookUrl: https://github.com/Nextdoor/k8s-charts/tree/main/charts/prometheus-rules/runbook.md
+  runbookUrl: https://github.com/Nextdoor/k8s-charts/blob/main/charts/prometheus-alerts/runbook.md
   # -- Additional custom labels attached to every PrometheusRule
   additionalRuleLabels: {}
 
@@ -165,3 +165,30 @@ containerRules:
   KubeHpaMaxedOut:
     severity: warning
     for: 15m
+
+  CPUThrottlingHigh:
+    severity: warning
+    threshold: 65
+    for: 15m
+
+# Namespace Alerting Rules
+#
+# These rules provide some basic alerting around namespace limits that may
+# prevent a user's workload from scaling up.
+#
+namespaceRules:
+  # -- Whether or not to enable the namespace rules template
+  enabled: true
+
+  # Alerts if any of the resources in a given Namespace are close to the Quotas
+  # assigned to that Namespace.
+  KubeQuotaAlmostFull:
+    severity: warning
+    threshold: 90
+    for: 10m
+
+  # Similar to above - but with a higher threshold and a higher severity.
+  KubeQuotaFullyUsed:
+    severity: critical
+    threshold: 99
+    for: 10m