Add support for external hpa metrics in workers

iterative · Mar 6, 2024 · 60239cd · 60239cd
1 parent 7156e58
commit 60239cd
Show file tree

Hide file tree

Showing 5 changed files with 107 additions and 10 deletions.
diff --git a/charts/studio/Chart.yaml b/charts/studio/Chart.yaml
@@ -2,7 +2,7 @@ apiVersion: v2
 name: studio
 description: A Helm chart for Kubernetes
 type: application
-version: 0.10.15
+version: 0.11.0
 appVersion: "v2.92.0"
 maintainers:
   - name: iterative

diff --git a/charts/studio/README.md b/charts/studio/README.md
@@ -1,6 +1,6 @@
 # studio
 
-![Version: 0.10.15](https://img.shields.io/badge/Version-0.10.15-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: v2.92.0](https://img.shields.io/badge/AppVersion-v2.92.0-informational?style=flat-square)
+![Version: 0.11.0](https://img.shields.io/badge/Version-0.11.0-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: v2.92.0](https://img.shields.io/badge/AppVersion-v2.92.0-informational?style=flat-square)
 
 A Helm chart for Kubernetes
 
@@ -136,13 +136,16 @@ A Helm chart for Kubernetes
 | studioBlobvault.image | object | `{"repository":"nginx","tag":"1.25.1-alpine"}` | Image to use for the blobvault service |
 | studioBlobvault.image.repository | string | `"nginx"` | Image repository |
 | studioBlobvault.image.tag | string | `"1.25.1-alpine"` | Image tag |
-| studioDvcxWorker | object | `{"affinity":{},"autoscaling":{"enabled":false,"maxReplicas":5,"minReplicas":1,"targetCPUUtilizationPercentage":80},"envFromSecret":"","envVars":{},"ephemeralStorage":{"persistentVolumeClaim":{"claimName":"dvcx-worker","storageClass":""},"size":"20Gi","type":"ephemeral"},"image":{"pullPolicy":"IfNotPresent","repository":"docker.iterative.ai/studio-dvcx-worker"},"logLevel":"info","nodeSelector":{},"podAnnotations":{},"podSecurityContext":{},"replicaCount":1,"resources":{"limits":{"ephemeral-storage":"20Gi","memory":"16Gi"},"requests":{"cpu":"1000m","ephemeral-storage":"10Gi","memory":"3Gi"}},"securityContext":{},"serviceAccount":{"annotations":{},"create":false,"name":""},"strategy":{"rollingUpdate":{"maxSurge":1,"maxUnavailable":0}},"tolerations":[]}` | Studio DVCx Worker settings group |
+| studioDvcxWorker | object | `{"affinity":{},"autoscaling":{"annotations":{},"behavior":{},"enabled":false,"maxReplicas":5,"minReplicas":1,"targetCPUUtilizationPercentage":80,"template":[]},"envFromSecret":"","envVars":{},"ephemeralStorage":{"persistentVolumeClaim":{"claimName":"dvcx-worker","storageClass":""},"size":"20Gi","type":"ephemeral"},"image":{"pullPolicy":"IfNotPresent","repository":"docker.iterative.ai/studio-dvcx-worker"},"logLevel":"info","nodeSelector":{},"podAnnotations":{},"podSecurityContext":{},"replicaCount":1,"resources":{"limits":{"ephemeral-storage":"20Gi","memory":"16Gi"},"requests":{"cpu":"1000m","ephemeral-storage":"10Gi","memory":"3Gi"}},"securityContext":{},"serviceAccount":{"annotations":{},"create":false,"name":""},"strategy":{"rollingUpdate":{"maxSurge":1,"maxUnavailable":0}},"tolerations":[]}` | Studio DVCx Worker settings group |
 | studioDvcxWorker.affinity | object | `{}` | DVCx worker pod affinity configuration |
-| studioDvcxWorker.autoscaling | object | `{"enabled":false,"maxReplicas":5,"minReplicas":1,"targetCPUUtilizationPercentage":80}` | DVCx worker autoscaling configuration |
+| studioDvcxWorker.autoscaling | object | `{"annotations":{},"behavior":{},"enabled":false,"maxReplicas":5,"minReplicas":1,"targetCPUUtilizationPercentage":80,"template":[]}` | DVCx worker autoscaling configuration |
+| studioDvcxWorker.autoscaling.annotations | object | `{}` | DVCx worker autoscaling annotations |
+| studioDvcxWorker.autoscaling.behavior | object | `{}` | DVCx worker autoscaling behavior |
 | studioDvcxWorker.autoscaling.enabled | bool | `false` | DVCx worker autoscaling enabled flag |
 | studioDvcxWorker.autoscaling.maxReplicas | int | `5` | DVCx worker autoscaling max replicas |
 | studioDvcxWorker.autoscaling.minReplicas | int | `1` | DVCx worker autoscaling min replicas |
 | studioDvcxWorker.autoscaling.targetCPUUtilizationPercentage | int | `80` | DVCx worker autoscaling target CPU utilization percentage |
+| studioDvcxWorker.autoscaling.template | list | `[]` | DVCx worker custom or additional autoscaling metrics, rendered as extra entries under the HPA `spec.metrics` list. ref: https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/#support-for-custom-metrics |
 | studioDvcxWorker.envFromSecret | string | `""` | The name of an existing Secret that contains sensitive environment variables passed to DVCx worker pods. |
 | studioDvcxWorker.envVars | object | `{}` | Additional environment variables for DVCx worker pods |
 | studioDvcxWorker.ephemeralStorage | object | `{"persistentVolumeClaim":{"claimName":"dvcx-worker","storageClass":""},"size":"20Gi","type":"ephemeral"}` | Ephemeral storage configuration |
@@ -167,13 +170,15 @@ A Helm chart for Kubernetes
 | studioUi | object | `{"affinity":{},"autoscaling":{"enabled":false,"maxReplicas":5,"minReplicas":1,"targetCPUUtilizationPercentage":80},"envFromSecret":"","envVars":{},"image":{"pullPolicy":"IfNotPresent","repository":"docker.iterative.ai/studio-frontend"},"nodeSelector":{},"podAnnotations":{},"podSecurityContext":{},"replicaCount":1,"resources":{"limits":{"memory":"2Gi"},"requests":{"cpu":"500m","memory":"1Gi"}},"securityContext":{},"service":{"port":3000,"type":"ClusterIP"},"strategy":{"rollingUpdate":{"maxSurge":1,"maxUnavailable":0}},"tolerations":[]}` | Studio UI settings group |
 | studioUi.envFromSecret | string | `""` | The name of an existing Secret that contains sensitive environment variables passed to UI pods. |
 | studioUi.envVars | object | `{}` | Additional environment variables for ui pods |
-| studioWorker | object | `{"affinity":{},"autoscaling":{"enabled":false,"maxReplicas":5,"minReplicas":1,"targetCPUUtilizationPercentage":80},"envFromSecret":"","envVars":{},"image":{"pullPolicy":"IfNotPresent","repository":"docker.iterative.ai/studio-backend"},"logLevel":"info","nodeSelector":{},"podAnnotations":{},"podSecurityContext":{},"replicaCount":1,"resources":{"limits":{"memory":"1Gi"},"requests":{"cpu":"500m","memory":"512Mi"}},"securityContext":{},"serviceAccount":{"annotations":{},"create":false,"name":""},"strategy":{"rollingUpdate":{"maxSurge":1,"maxUnavailable":0}},"tolerations":[]}` | Studio worker settings group |
+| studioWorker | object | `{"affinity":{},"autoscaling":{"annotations":{},"behavior":{},"enabled":false,"maxReplicas":5,"minReplicas":1,"template":[]},"envFromSecret":"","envVars":{},"image":{"pullPolicy":"IfNotPresent","repository":"docker.iterative.ai/studio-backend"},"logLevel":"info","nodeSelector":{},"podAnnotations":{},"podSecurityContext":{},"replicaCount":1,"resources":{"limits":{"memory":"1Gi"},"requests":{"cpu":"500m","memory":"512Mi"}},"securityContext":{},"serviceAccount":{"annotations":{},"create":false,"name":""},"strategy":{"rollingUpdate":{"maxSurge":1,"maxUnavailable":0}},"tolerations":[]}` | Studio worker settings group |
 | studioWorker.affinity | object | `{}` | Worker affinity |
-| studioWorker.autoscaling | object | `{"enabled":false,"maxReplicas":5,"minReplicas":1,"targetCPUUtilizationPercentage":80}` | Worker autoscaling configuration |
+| studioWorker.autoscaling | object | `{"annotations":{},"behavior":{},"enabled":false,"maxReplicas":5,"minReplicas":1,"template":[]}` | Worker autoscaling configuration |
+| studioWorker.autoscaling.annotations | object | `{}` | Worker autoscaling annotations |
+| studioWorker.autoscaling.behavior | object | `{}` | Worker autoscaling behavior |
 | studioWorker.autoscaling.enabled | bool | `false` | Worker autoscaling enabled flag |
 | studioWorker.autoscaling.maxReplicas | int | `5` | Worker autoscaling maximum replicas |
 | studioWorker.autoscaling.minReplicas | int | `1` | Worker autoscaling minimum replicas |
-| studioWorker.autoscaling.targetCPUUtilizationPercentage | int | `80` | Worker autoscaling target CPU utilization percentage |
+| studioWorker.autoscaling.template | list | `[]` | Worker custom or additional autoscaling metrics, rendered as extra entries under the HPA `spec.metrics` list. ref: https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/#support-for-custom-metrics |
 | studioWorker.envFromSecret | string | `""` | The name of an existing Secret that contains sensitive environment variables passed to worker pods. |
 | studioWorker.envVars | object | `{}` | Additional environment variables for worker pods |
 | studioWorker.image | object | `{"pullPolicy":"IfNotPresent","repository":"docker.iterative.ai/studio-backend"}` | Studio worker image settings |
@@ -191,4 +196,4 @@ A Helm chart for Kubernetes
 | studioWorker.tolerations | list | `[]` | Worker tolerations |
 
 ----------------------------------------------
-Autogenerated from chart metadata using [helm-docs v1.13.0](https://github.com/norwoodj/helm-docs/releases/v1.13.0)
+Autogenerated from chart metadata using [helm-docs v1.13.1](https://github.com/norwoodj/helm-docs/releases/v1.13.1)
diff --git a/charts/studio/templates/hpa-studio-dvcx-worker.yaml b/charts/studio/templates/hpa-studio-dvcx-worker.yaml
@@ -5,6 +5,9 @@ metadata:
   name: {{.Release.Name}}-dvcx-worker
   labels:
     {{- include "studio-dvcx-worker.labels" . | nindent 4 }}
+  {{- with .Values.studioDvcxWorker.autoscaling.annotations }}
+  annotations: {{ toYaml . | nindent 4 }}
+  {{- end }}
 spec:
   scaleTargetRef:
     apiVersion: apps/v1
@@ -34,4 +37,11 @@ spec:
           type: Utilization
           averageUtilization: {{ .Values.studioDvcxWorker.autoscaling.targetCPUUtilizationPercentage }}
     {{- end }}
+    {{- with .Values.studioDvcxWorker.autoscaling.template }}
+    {{- toYaml . | nindent 4 }}
+    {{- end }}
+  {{- with .Values.studioDvcxWorker.autoscaling.behavior }}
+  behavior:
+    {{- toYaml . | nindent 4 }}
+  {{- end }}
 {{- end }}
diff --git a/charts/studio/templates/hpa-studio-worker.yaml b/charts/studio/templates/hpa-studio-worker.yaml
@@ -2,9 +2,12 @@
 apiVersion: autoscaling/v2
 kind: HorizontalPodAutoscaler
 metadata:
-  name: studio-worker
+  name: {{.Release.Name}}-studio-worker
   labels:
     {{- include "studio-worker.labels" . | nindent 4 }}
+  {{- with .Values.studioWorker.autoscaling.annotations }}
+  annotations: {{ toYaml . | nindent 4 }}
+  {{- end }}
 spec:
   scaleTargetRef:
     apiVersion: apps/v1
@@ -29,4 +32,11 @@ spec:
           type: Utilization
           averageUtilization: {{ .Values.studioWorker.autoscaling.targetCPUUtilizationPercentage }}
     {{- end }}
+    {{- with .Values.studioWorker.autoscaling.template }}
+    {{- toYaml . | nindent 4 }}
+    {{- end }}
+  {{- with .Values.studioWorker.autoscaling.behavior }}
+  behavior:
+    {{- toYaml . | nindent 4 }}
+  {{- end }}
 {{- end }}
diff --git a/charts/studio/values.yaml b/charts/studio/values.yaml
@@ -576,10 +576,46 @@ studioWorker:
     # -- Worker autoscaling maximum replicas
     maxReplicas: 5
     # -- Worker autoscaling target CPU utilization percentage
-    targetCPUUtilizationPercentage: 80
+    # targetCPUUtilizationPercentage: 80
     # -- Worker autoscaling target memory utilization percentage
     # targetMemoryUtilizationPercentage: 80
 
+    # -- Worker autoscaling annotations
+    annotations: {}
+
+    # -- Worker custom or additional autoscaling metrics,
+    # rendered as extra entries under the HPA `spec.metrics` list.
+    # ref: https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/#support-for-custom-metrics
+    template: []
+    # - type: External
+    #   external:
+    #     metric:
+    #       name: celery-queue-length
+    #       selector:
+    #         matchLabels:
+    #           type: prometheus
+    #     target:
+    #       type: Value
+    #       value: "1"
+
+    # -- Worker autoscaling behavior
+    behavior: {}
+    # scaleUp:
+    #   stabilizationWindowSeconds: 15
+    #   policies:
+    #     - type: Percent
+    #       value: 100
+    #       periodSeconds: 1
+    #     - type: Pods
+    #       value: 2
+    #       periodSeconds: 1
+    # scaleDown:
+    #   stabilizationWindowSeconds: 120
+    #   policies:
+    #     - type: Pods
+    #       value: 1
+    #       periodSeconds: 60
+
   # -- Additional worker pod annotations
   podAnnotations: {}
 
@@ -684,6 +720,42 @@ studioDvcxWorker:
     # -- DVCx worker autoscaling target memory utilization percentage
     # targetMemoryUtilizationPercentage: 80
 
+    # -- DVCx worker autoscaling annotations
+    annotations: {}
+
+    # -- DVCx worker custom or additional autoscaling metrics,
+    # rendered as extra entries under the HPA `spec.metrics` list.
+    # ref: https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/#support-for-custom-metrics
+    template: []
+    # - type: External
+    #   external:
+    #     metric:
+    #       name: celery-queue-length
+    #       selector:
+    #         matchLabels:
+    #           type: prometheus
+    #     target:
+    #       type: Value
+    #       value: "1"
+
+    # -- DVCx worker autoscaling behavior
+    behavior: {}
+    # scaleUp:
+    #   stabilizationWindowSeconds: 15
+    #   policies:
+    #     - type: Percent
+    #       value: 100
+    #       periodSeconds: 1
+    #     - type: Pods
+    #       value: 2
+    #       periodSeconds: 1
+    # scaleDown:
+    #   stabilizationWindowSeconds: 120
+    #   policies:
+    #     - type: Pods
+    #       value: 1
+    #       periodSeconds: 60
+
   # -- Additional DVCx worker pod annotations
   podAnnotations: {}