From c378f1936de54feb4cd21525a374617066500686 Mon Sep 17 00:00:00 2001 From: Albert Zhang Date: Thu, 16 Sep 2021 15:11:32 -0400 Subject: [PATCH] add job restarts limit to values (#54) --- charts/flink-job/Chart.yaml | 2 +- charts/flink-job/README.md | 7 ++++--- charts/flink-job/templates/prometheusrule.yaml | 2 +- charts/flink-job/values.yaml | 5 ++++- 4 files changed, 10 insertions(+), 6 deletions(-) diff --git a/charts/flink-job/Chart.yaml b/charts/flink-job/Chart.yaml index d7a3e18c..169ac8aa 100644 --- a/charts/flink-job/Chart.yaml +++ b/charts/flink-job/Chart.yaml @@ -2,7 +2,7 @@ apiVersion: v2 appVersion: "1.0" description: Flink job cluster on k8s name: flink-job -version: 0.0.4 +version: 0.0.5 maintainers: - name: Zedive email: albert@nextdoor.com diff --git a/charts/flink-job/README.md b/charts/flink-job/README.md index 1d1b097d..cf59e6a8 100644 --- a/charts/flink-job/README.md +++ b/charts/flink-job/README.md @@ -2,7 +2,7 @@ Flink job cluster on k8s -![Version: 0.0.4](https://img.shields.io/badge/Version-0.0.4-informational?style=flat-square) ![AppVersion: 1.0](https://img.shields.io/badge/AppVersion-1.0-informational?style=flat-square) +![Version: 0.0.5](https://img.shields.io/badge/Version-0.0.5-informational?style=flat-square) ![AppVersion: 1.0](https://img.shields.io/badge/AppVersion-1.0-informational?style=flat-square) This chart deploys a flink job cluster and runs a simple word counting flink app as an example. This chart includes some production ready set-ups such as @@ -19,8 +19,9 @@ See metrics reporter in the flink properties for more details. | Key | Type | Default | Description | |-----|------|---------|-------------| -| alerts.enabled | bool | `true` | (Boolean) whether to create the PrometheusRule for this flink cluster | -| alerts.severity | string | `"info"` | | +| alerts.enabled | bool | `true` | (Boolean) Specifies whether to create the PrometheusRule for this flink cluster | +| alerts.restartsLimit | int | `2` | (`int`) The number of job restarts before alerting | +| alerts.severity | string | `"info"` | (String) Severity of the alerts | | defaults.runbookUrl | string | `"https://github.com/Nextdoor/k8s-charts/blob/main/charts/flink-job/runbook.md"` | (String) Runbook URL for the Prometheus alerts | | envVars | list | `[{"name":"HADOOP_CLASSPATH","value":"/opt/flink/opt/flink-metrics-prometheus-1.9.3.jar"}]` | Environment variables shared by all containers | | flinkProperties | object | `{"execution.checkpointing.interval":"10min","execution.checkpointing.mode":"EXACTLY_ONCE","high-availability":"org.apache.flink.kubernetes.highavailability.KubernetesHaServicesFactory","high-availability.storageDir":"file:/savepoint/","kubernetes.cluster-id":"{{ .Values.fullnameOverride }}","kubernetes.namespace":"{{ .Release.Namespace }}","metrics.reporter.prom.class":"org.apache.flink.metrics.prometheus.PrometheusReporter","metrics.reporters":"prom","restart-strategy":"exponential-delay","restart-strategy.exponential-delay.backoff-multiplier":"2.0","state.checkpoints.dir":"file:/savepoint/","taskmanager.numberOfTaskSlots":"1"}` | (`Map`) Flink properties which are appened to flink-conf.yaml | diff --git a/charts/flink-job/templates/prometheusrule.yaml b/charts/flink-job/templates/prometheusrule.yaml index cbf56623..a8736def 100644 --- a/charts/flink-job/templates/prometheusrule.yaml +++ b/charts/flink-job/templates/prometheusrule.yaml @@ -47,7 +47,7 @@ spec: changes(flink_jobmanager_job_numRestarts{ cluster="{{ $cluster }}", namespace="{{ $namespace }}" - }[30m]) > 2 + }[30m]) > {{ .Values.alerts.restartsLimit }} for: 10m labels: severity: {{ .Values.alerts.severity }} diff --git a/charts/flink-job/values.yaml b/charts/flink-job/values.yaml index e91e9041..ac564f81 100644 --- a/charts/flink-job/values.yaml +++ b/charts/flink-job/values.yaml @@ -199,9 +199,12 @@ savepoints: enabled: true alerts: - # -- (Boolean) whether to create the PrometheusRule for this flink cluster + # -- (Boolean) Specifies whether to create the PrometheusRule for this flink cluster enabled: true + # -- (String) Severity of the alerts severity: info + # -- (`int`) The number of job restarts before alerting + restartsLimit: 2 defaults: # -- (String) Runbook URL for the Prometheus alerts