Skip to content

Commit

Permalink
Update gojsontoyaml release and generate manifests (#2043)
Browse files Browse the repository at this point in the history
* dep: Bump to release v0.1.0 of gojsontoyaml

* make: Generate
  • Loading branch information
philipgough authored Mar 15, 2023
1 parent 274d585 commit f6f3682
Show file tree
Hide file tree
Showing 18 changed files with 4,115 additions and 14,117 deletions.
44 changes: 13 additions & 31 deletions manifests/alertmanager-prometheusRule.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,7 @@ spec:
rules:
- alert: AlertmanagerFailedReload
annotations:
description: Configuration has failed to load for {{ $labels.namespace }}/{{
$labels.pod}}.
description: Configuration has failed to load for {{ $labels.namespace }}/{{ $labels.pod}}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerfailedreload
summary: Reloading an Alertmanager configuration has failed.
expr: |
Expand All @@ -30,11 +29,9 @@ spec:
severity: critical
- alert: AlertmanagerMembersInconsistent
annotations:
description: Alertmanager {{ $labels.namespace }}/{{ $labels.pod}} has only
found {{ $value }} members of the {{$labels.job}} cluster.
description: Alertmanager {{ $labels.namespace }}/{{ $labels.pod}} has only found {{ $value }} members of the {{$labels.job}} cluster.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagermembersinconsistent
summary: A member of an Alertmanager cluster has not found all other cluster
members.
summary: A member of an Alertmanager cluster has not found all other cluster members.
expr: |
# Without max_over_time, failed scrapes could create false negatives, see
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
Expand All @@ -46,9 +43,7 @@ spec:
severity: critical
- alert: AlertmanagerFailedToSendAlerts
annotations:
description: Alertmanager {{ $labels.namespace }}/{{ $labels.pod}} failed
to send {{ $value | humanizePercentage }} of notifications to {{ $labels.integration
}}.
description: Alertmanager {{ $labels.namespace }}/{{ $labels.pod}} failed to send {{ $value | humanizePercentage }} of notifications to {{ $labels.integration }}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerfailedtosendalerts
summary: An Alertmanager instance failed to send notifications.
expr: |
Expand All @@ -63,12 +58,9 @@ spec:
severity: warning
- alert: AlertmanagerClusterFailedToSendAlerts
annotations:
description: The minimum notification failure rate to {{ $labels.integration
}} sent from any instance in the {{$labels.job}} cluster is {{ $value |
humanizePercentage }}.
description: The minimum notification failure rate to {{ $labels.integration }} sent from any instance in the {{$labels.job}} cluster is {{ $value | humanizePercentage }}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclusterfailedtosendalerts
summary: All Alertmanager instances in a cluster failed to send notifications
to a critical integration.
summary: All Alertmanager instances in a cluster failed to send notifications to a critical integration.
expr: |
min by (namespace,service, integration) (
rate(alertmanager_notifications_failed_total{job="alertmanager-main",namespace="monitoring", integration=~`.*`}[5m])
Expand All @@ -81,12 +73,9 @@ spec:
severity: critical
- alert: AlertmanagerClusterFailedToSendAlerts
annotations:
description: The minimum notification failure rate to {{ $labels.integration
}} sent from any instance in the {{$labels.job}} cluster is {{ $value |
humanizePercentage }}.
description: The minimum notification failure rate to {{ $labels.integration }} sent from any instance in the {{$labels.job}} cluster is {{ $value | humanizePercentage }}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclusterfailedtosendalerts
summary: All Alertmanager instances in a cluster failed to send notifications
to a non-critical integration.
summary: All Alertmanager instances in a cluster failed to send notifications to a non-critical integration.
expr: |
min by (namespace,service, integration) (
rate(alertmanager_notifications_failed_total{job="alertmanager-main",namespace="monitoring", integration!~`.*`}[5m])
Expand All @@ -99,8 +88,7 @@ spec:
severity: warning
- alert: AlertmanagerConfigInconsistent
annotations:
description: Alertmanager instances within the {{$labels.job}} cluster have
different configurations.
description: Alertmanager instances within the {{$labels.job}} cluster have different configurations.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerconfiginconsistent
summary: Alertmanager instances within the same cluster have different configurations.
expr: |
Expand All @@ -113,12 +101,9 @@ spec:
severity: critical
- alert: AlertmanagerClusterDown
annotations:
description: '{{ $value | humanizePercentage }} of Alertmanager instances
within the {{$labels.job}} cluster have been up for less than half of the
last 5m.'
description: '{{ $value | humanizePercentage }} of Alertmanager instances within the {{$labels.job}} cluster have been up for less than half of the last 5m.'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclusterdown
summary: Half or more of the Alertmanager instances within the same cluster
are down.
summary: Half or more of the Alertmanager instances within the same cluster are down.
expr: |
(
count by (namespace,service) (
Expand All @@ -135,12 +120,9 @@ spec:
severity: critical
- alert: AlertmanagerClusterCrashlooping
annotations:
description: '{{ $value | humanizePercentage }} of Alertmanager instances
within the {{$labels.job}} cluster have restarted at least 5 times in the
last 10m.'
description: '{{ $value | humanizePercentage }} of Alertmanager instances within the {{$labels.job}} cluster have restarted at least 5 times in the last 10m.'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclustercrashlooping
summary: Half or more of the Alertmanager instances within the same cluster
are crashlooping.
summary: Half or more of the Alertmanager instances within the same cluster are crashlooping.
expr: |
(
count by (namespace,service) (
Expand Down
3 changes: 1 addition & 2 deletions manifests/grafana-prometheusRule.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,7 @@ spec:
rules:
- alert: GrafanaRequestsFailing
annotations:
message: '{{ $labels.namespace }}/{{ $labels.job }}/{{ $labels.handler }}
is experiencing {{ $value | humanize }}% errors'
message: '{{ $labels.namespace }}/{{ $labels.job }}/{{ $labels.handler }} is experiencing {{ $value | humanize }}% errors'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/grafana/grafanarequestsfailing
expr: |
100 * namespace_job_handler_statuscode:grafana_http_request_duration_seconds_count:rate5m{handler!~"/api/datasources/proxy/:id.*|/api/ds/query|/api/tsdb/query", status_code=~"5.."}
Expand Down
25 changes: 8 additions & 17 deletions manifests/kubePrometheus-prometheusRule.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,10 @@ spec:
rules:
- alert: TargetDown
annotations:
description: '{{ printf "%.4g" $value }}% of the {{ $labels.job }}/{{ $labels.service
}} targets in {{ $labels.namespace }} namespace are down.'
description: '{{ printf "%.4g" $value }}% of the {{ $labels.job }}/{{ $labels.service }} targets in {{ $labels.namespace }} namespace are down.'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/targetdown
summary: One or more targets are unreachable.
expr: 100 * (count(up == 0) BY (job, namespace, service) / count(up) BY (job,
namespace, service)) > 10
expr: 100 * (count(up == 0) BY (job, namespace, service) / count(up) BY (job, namespace, service)) > 10
for: 10m
labels:
severity: warning
Expand All @@ -33,8 +31,7 @@ spec:
mechanisms that send a notification when this alert is not firing. For example the
"DeadMansSnitch" integration in PagerDuty.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/watchdog
summary: An alert that should always be firing to certify that Alertmanager
is working properly.
summary: An alert that should always be firing to certify that Alertmanager is working properly.
expr: vector(1)
labels:
severity: none
Expand All @@ -49,16 +46,14 @@ spec:
This alert should be routed to a null receiver and configured to inhibit alerts with severity="info".
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/infoinhibitor
summary: Info-level alert inhibition.
expr: ALERTS{severity = "info"} == 1 unless on(namespace) ALERTS{alertname !=
"InfoInhibitor", severity =~ "warning|critical", alertstate="firing"} == 1
expr: ALERTS{severity = "info"} == 1 unless on(namespace) ALERTS{alertname != "InfoInhibitor", severity =~ "warning|critical", alertstate="firing"} == 1
labels:
severity: none
- name: node-network
rules:
- alert: NodeNetworkInterfaceFlapping
annotations:
description: Network interface "{{ $labels.device }}" changing its up status
often on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}
description: Network interface "{{ $labels.device }}" changing its up status often on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/nodenetworkinterfaceflapping
summary: Network interface is often changing its status
expr: |
Expand All @@ -68,21 +63,17 @@ spec:
severity: warning
- name: kube-prometheus-node-recording.rules
rules:
- expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[3m]))
BY (instance)
- expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[3m])) BY (instance)
record: instance:node_cpu:rate:sum
- expr: sum(rate(node_network_receive_bytes_total[3m])) BY (instance)
record: instance:node_network_receive_bytes:rate:sum
- expr: sum(rate(node_network_transmit_bytes_total[3m])) BY (instance)
record: instance:node_network_transmit_bytes:rate:sum
- expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m]))
WITHOUT (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu_seconds_total)
BY (instance, cpu)) BY (instance)
- expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m])) WITHOUT (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu_seconds_total) BY (instance, cpu)) BY (instance)
record: instance:node_cpu:ratio
- expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m]))
record: cluster:node_cpu:sum_rate5m
- expr: cluster:node_cpu:sum_rate5m / count(sum(node_cpu_seconds_total) BY (instance,
cpu))
- expr: cluster:node_cpu:sum_rate5m / count(sum(node_cpu_seconds_total) BY (instance, cpu))
record: cluster:node_cpu:ratio
- name: kube-prometheus-general.rules
rules:
Expand Down
15 changes: 4 additions & 11 deletions manifests/kubeStateMetrics-prometheusRule.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,7 @@ spec:
rules:
- alert: KubeStateMetricsListErrors
annotations:
description: kube-state-metrics is experiencing errors at an elevated rate
in list operations. This is likely causing it to not be able to expose metrics
about Kubernetes objects correctly or at all.
description: kube-state-metrics is experiencing errors at an elevated rate in list operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricslisterrors
summary: kube-state-metrics is experiencing errors in list operations.
expr: |
Expand All @@ -31,9 +29,7 @@ spec:
severity: critical
- alert: KubeStateMetricsWatchErrors
annotations:
description: kube-state-metrics is experiencing errors at an elevated rate
in watch operations. This is likely causing it to not be able to expose
metrics about Kubernetes objects correctly or at all.
description: kube-state-metrics is experiencing errors at an elevated rate in watch operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricswatcherrors
summary: kube-state-metrics is experiencing errors in watch operations.
expr: |
Expand All @@ -46,9 +42,7 @@ spec:
severity: critical
- alert: KubeStateMetricsShardingMismatch
annotations:
description: kube-state-metrics pods are running with different --total-shards
configuration, some Kubernetes objects may be exposed multiple times or
not exposed at all.
description: kube-state-metrics pods are running with different --total-shards configuration, some Kubernetes objects may be exposed multiple times or not exposed at all.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricsshardingmismatch
summary: kube-state-metrics sharding is misconfigured.
expr: |
Expand All @@ -58,8 +52,7 @@ spec:
severity: critical
- alert: KubeStateMetricsShardsMissing
annotations:
description: kube-state-metrics shards are missing, some Kubernetes objects
are not being exposed.
description: kube-state-metrics shards are missing, some Kubernetes objects are not being exposed.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricsshardsmissing
summary: kube-state-metrics shards are missing.
expr: |
Expand Down
Loading

0 comments on commit f6f3682

Please sign in to comment.