Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion charts/model-engine/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ type: application
# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/)
version: 0.1.10
version: 0.2.0-beta.1

# This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application. Versions are not expected to
Expand Down
53 changes: 52 additions & 1 deletion charts/model-engine/templates/_helpers.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -321,6 +321,8 @@ env:
value: {{ .Values.aws.profileName }}
- name: S3_WRITE_AWS_PROFILE
value: {{ .Values.aws.s3WriteProfileName }}
{{- else }}
{{- /* On-prem: Do NOT set AWS_PROFILE - boto3 uses default credential chain */ -}}
{{- end }}
{{- with .Values.secrets }}
{{- if .kubernetesDatabaseSecretName }}
Expand Down Expand Up @@ -367,7 +369,13 @@ env:
- name: CELERY_RESULT_BACKEND
value: {{ .Values.celeryResultBackend | quote }}
{{- end }}
{{- if .Values.redis.auth}}
{{- if .Values.redis.authSecretName }}
- name: REDIS_AUTH_TOKEN
valueFrom:
secretKeyRef:
name: {{ .Values.redis.authSecretName }}
key: {{ .Values.redis.authSecretKey | default "auth_token" }}
{{- else if .Values.redis.auth }}
- name: REDIS_AUTH_TOKEN
value: {{ .Values.redis.auth }}
{{- end }}
Expand Down Expand Up @@ -399,6 +407,9 @@ env:
value: {{ .Values.tag }}
- name: GIT_TAG
value: {{ .Values.tag }}
{{- with .Values.extraEnvVars }}
{{- toYaml . | nindent 2 }}
{{- end }}
{{- end }}

{{- define "modelEngine.serviceEnvGitTagFromPythonReplace" }}
Expand Down Expand Up @@ -455,6 +466,10 @@ volumes:
- key: infra_service_config
path: config.yaml
{{- end }}
{{- with .Values.extraVolumes }}
{{- toYaml . | nindent 2 }}
{{- end }}
{{- include "modelEngine.tokenVolume" . | nindent 2 }}
{{- end }}

{{- define "modelEngine.volumeMounts" }}
Expand All @@ -474,6 +489,10 @@ volumeMounts:
- name: infra-service-config-volume
mountPath: /workspace/model-engine/model_engine_server/core/configs
{{- end }}
{{- with .Values.extraVolumeMounts }}
{{- toYaml . | nindent 2 }}
{{- end }}
{{- include "modelEngine.tokenVolumeMount" . | nindent 2 }}
{{- end }}

{{- define "modelEngine.forwarderVolumeMounts" }}
Expand Down Expand Up @@ -502,3 +521,35 @@ namespaces:
- {{ . }}
{{- end }}
{{- end }}

{{/*
modelEngine.tokenVolume: emits a volume list item named "token-volume".
Rendered only when .Values.automountServiceAccountToken is falsy. The projected
volume combines three sources: a service account token (path "token",
expirationSeconds 86400), the "ca.crt" key of the kube-root-ca.crt ConfigMap,
and the pod namespace exposed through the downward API (path "namespace").
Callers are expected to pipe the result through nindent to place it under
their own "volumes:" key.
*/}}
{{- define "modelEngine.tokenVolume" }}
{{- if not .Values.automountServiceAccountToken }}
- name: token-volume
projected:
defaultMode: 0444
sources:
- serviceAccountToken:
path: token
expirationSeconds: 86400
# We also need to project the CA cert and namespace files
- configMap:
name: kube-root-ca.crt
items:
- key: ca.crt
path: ca.crt
- downwardAPI:
items:
- path: namespace
fieldRef:
apiVersion: v1
fieldPath: metadata.namespace
{{- end }}
{{- end }}

{{/*
modelEngine.tokenVolumeMount: emits a read-only volumeMounts list item that
mounts the "token-volume" volume at /var/run/secrets/kubernetes.io/serviceaccount.
Rendered only when .Values.automountServiceAccountToken is falsy, i.e. under the
same condition as the matching "modelEngine.tokenVolume" helper.
*/}}
{{- define "modelEngine.tokenVolumeMount" }}
{{- if not .Values.automountServiceAccountToken }}
- mountPath: /var/run/secrets/kubernetes.io/serviceaccount
name: token-volume
readOnly: true
{{- end }}
{{- end }}
2 changes: 1 addition & 1 deletion charts/model-engine/templates/balloon_cpu_deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ spec:
{{- toYaml . | nindent 8 }}
{{- end }}
containers:
- image: public.ecr.aws/ubuntu/ubuntu:latest
- image: {{ $.Values.utilityImages.ubuntu.repository }}:{{ $.Values.utilityImages.ubuntu.tag }}
imagePullPolicy: IfNotPresent
name: main
resources:
Expand Down
7 changes: 6 additions & 1 deletion charts/model-engine/templates/balloon_deployments.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,13 @@ spec:
- key: "nvidia.com/gpu"
operator: "Exists"
effect: "NoSchedule"
{{- range $.Values.gpuTolerations }}
- key: {{ .key | quote }}
operator: "Exists"
effect: "NoSchedule"
{{- end }}
containers:
- image: public.ecr.aws/ubuntu/ubuntu:latest
- image: {{ $.Values.utilityImages.ubuntu.repository }}:{{ $.Values.utilityImages.ubuntu.tag }}
imagePullPolicy: IfNotPresent
name: main
resources:
Expand Down
1 change: 1 addition & 0 deletions charts/model-engine/templates/cacher_deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ spec:
{{- toYaml .Values.resources | nindent 12 }}
{{- include "modelEngine.cacherEnv" . | indent 10 }}
{{- include "modelEngine.volumeMounts" . | indent 10 }}
automountServiceAccountToken: {{ .Values.automountServiceAccountToken }}
serviceAccountName: {{ include "modelEngine.fullname" . }}
{{- include "modelEngine.volumes" . | indent 6 }}
{{- with .Values.nodeSelector }}
Expand Down
15 changes: 12 additions & 3 deletions charts/model-engine/templates/celery_autoscaler_stateful_set.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
{{ $broker_name = "sqs-message-broker-master" }}
{{- else if eq $message_broker "servicebus" }}
{{ $broker_name = "servicebus-message-broker-master" }}
{{- else if and .Values.config .Values.config.values .Values.config.values.infra (eq (.Values.config.values.infra.cloud_provider | default "") "gcp") }}
{{ $broker_name = "redis-gcp-memorystore-message-broker-master" }}
{{- end }}
apiVersion: apps/v1
kind: StatefulSet
Expand Down Expand Up @@ -86,12 +88,15 @@ spec:
resources:
requests:
cpu: 1000m
{{- if .Values.aws }}
volumeMounts:
{{- if .Values.aws }}
- mountPath: /opt/.aws/config
name: config-volume
subPath: config
{{- end }}
{{- if not $.Values.automountServiceAccountToken }}
{{- include "modelEngine.tokenVolumeMount" $ | nindent 8 }}
{{- end }}
{{ with .Values.nodeSelector }}
nodeSelector:
{{- toYaml . | nindent 8 }}
Expand All @@ -101,13 +106,17 @@ spec:
operator: Equal
value: 'true'
effect: NoSchedule
automountServiceAccountToken: {{ .Values.automountServiceAccountToken }}
serviceAccountName: {{ include "modelEngine.fullname" $ }}
{{- if .Values.aws }}
volumes:
{{- if .Values.aws }}
- configMap:
name: {{ .Values.aws.configMap.name }}
name: config-volume
{{- end}}
{{- end }}
{{- if not .Values.automountServiceAccountToken }}
{{- include "modelEngine.tokenVolume" . | nindent 6 }}
{{- end }}
updateStrategy:
rollingUpdate:
maxUnavailable: 20%
Expand Down
54 changes: 54 additions & 0 deletions charts/model-engine/templates/database_init_job.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
{{- if .Values.db.runDbInitScript }}
apiVersion: batch/v1
kind: Job
metadata:
name: {{ include "modelEngine.fullname" . }}-database-setup-{{ .Release.Revision }}
labels:
{{- include "modelEngine.labels" . | nindent 4 }}
annotations:
"helm.sh/hook": pre-install,pre-upgrade
# Weight -2 so this setup job is ordered ahead of the database migration hook,
# which runs at weight -1 on the same pre-install,pre-upgrade events. With equal
# weights Helm falls back to kind/name ordering rather than dependency order.
"helm.sh/hook-weight": "-2"
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hook weight collision with migration job

Both database_init_job.yaml and database_migration_job.yaml use "helm.sh/hook-weight": "-1" on pre-install,pre-upgrade. This means Helm does not guarantee execution order between them. Since init_database.py creates schemas (create schema if not exists) and tables (Base.metadata.create_all), and the migration job applies Alembic migrations that depend on those schemas existing, the migration could run first and fail.

Consider setting this job's hook weight to "-2" (or lower) so it is guaranteed to run before the migration job at weight "-1".

Suggested change
"helm.sh/hook-weight": "-1"
"helm.sh/hook-weight": "-2"
Prompt To Fix With AI
This is a comment left during a code review.
Path: charts/model-engine/templates/database_init_job.yaml
Line: 10

Comment:
**Hook weight collision with migration job**

Both `database_init_job.yaml` and `database_migration_job.yaml` use `"helm.sh/hook-weight": "-1"` on `pre-install,pre-upgrade`. This means Helm does not guarantee execution order between them. Since `init_database.py` creates schemas (`create schema if not exists`) and tables (`Base.metadata.create_all`), and the migration job applies Alembic migrations that depend on those schemas existing, the migration could run first and fail.

Consider setting this job's hook weight to `"-2"` (or lower) so it is guaranteed to run before the migration job at weight `"-1"`.

```suggestion
    "helm.sh/hook-weight": "-2"
```

How can I resolve this? If you propose a fix, please make it concise.

"helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded
spec:
backoffLimit: 0
activeDeadlineSeconds: 600
template:
metadata:
labels:
sidecar.istio.io/inject: "false"
{{- include "modelEngine.labels" . | nindent 8 }}
spec:
restartPolicy: Never
{{- with .Values.imagePullSecrets }}
imagePullSecrets:
{{- toYaml . | nindent 8 }}
{{- end }}
automountServiceAccountToken: {{ .Values.automountServiceAccountToken }}
containers:
- name: {{ include "modelEngine.fullname" . }}
image: "{{ .Values.image.gatewayRepository }}:{{ .Values.tag}}"
imagePullPolicy: {{ .Values.image.pullPolicy }}
command:
- dumb-init
- --
args:
- python
- -m
- model_engine_server.entrypoints.init_database
{{- include "modelEngine.serviceEnvGitTagFromHelmVar" . | indent 10 }}
{{- include "modelEngine.volumeMounts" . | indent 10 }}
serviceAccountName: {{ include "modelEngine.fullname" . }}
{{- include "modelEngine.volumes" . | indent 6 }}
{{- with .Values.nodeSelector }}
nodeSelector:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.affinity }}
affinity:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.tolerations }}
tolerations:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}
5 changes: 3 additions & 2 deletions charts/model-engine/templates/database_migration_job.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,13 @@
apiVersion: batch/v1
kind: Job
metadata:
name: {{ include "modelEngine.fullname" . }}-database-migration
name: {{ include "modelEngine.fullname" . }}-database-migration-{{ .Release.Revision }}
labels:
{{- include "modelEngine.labels" . | nindent 4 }}
annotations:
"helm.sh/hook": pre-install,pre-upgrade
"helm.sh/hook-weight": "-1"
"helm.sh/hook-delete-policy": hook-succeeded
"helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded
spec:
backoffLimit: 0
activeDeadlineSeconds: 600
Expand All @@ -35,6 +35,7 @@ spec:
- /workspace/model-engine/model_engine_server/db/migrations/run_database_migration.sh
{{- include "modelEngine.serviceEnvGitTagFromHelmVar" . | indent 10 }}
{{- include "modelEngine.volumeMounts" . | indent 10 }}
automountServiceAccountToken: {{ .Values.automountServiceAccountToken }}
serviceAccountName: {{ include "modelEngine.fullname" . }}
{{- include "modelEngine.volumes" . | indent 6 }}
{{- with .Values.nodeSelector }}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ spec:
{{- toYaml .Values.resources | nindent 12 }}
{{- include "modelEngine.builderEnv" . | indent 10 }}
{{- include "modelEngine.volumeMounts" . | indent 10 }}
automountServiceAccountToken: {{ .Values.automountServiceAccountToken }}
serviceAccountName: {{ include "modelEngine.fullname" . }}
{{- include "modelEngine.volumes" . | indent 6 }}
{{- with .Values.nodeSelector }}
Expand Down
1 change: 1 addition & 0 deletions charts/model-engine/templates/gateway_deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ spec:
{{- toYaml .Values.resources | nindent 12 }}
{{- include "modelEngine.gatewayEnv" . | indent 10 }}
{{- include "modelEngine.volumeMounts" . | indent 10 }}
automountServiceAccountToken: {{ .Values.automountServiceAccountToken }}
serviceAccountName: {{ include "modelEngine.fullname" . }}
{{- include "modelEngine.volumes" . | indent 6 }}
{{- with .Values.nodeSelector }}
Expand Down
14 changes: 7 additions & 7 deletions charts/model-engine/templates/inference_framework_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,10 @@ metadata:
"helm.sh/hook": pre-install
"helm.sh/hook-weight": "-2"
data:
deepspeed: "latest"
text_generation_inference: "latest"
vllm: "latest"
vllm_batch: "latest"
vllm_batch_v2: "latest"
lightllm: "latest"
tensorrt_llm: "latest"
deepspeed: {{ .Values.inferenceFramework.deepspeed | default "latest" | quote }}
text_generation_inference: {{ .Values.inferenceFramework.text_generation_inference | default "latest" | quote }}
vllm: {{ .Values.inferenceFramework.vllm | default "latest" | quote }}
vllm_batch: {{ .Values.inferenceFramework.vllm_batch | default "latest" | quote }}
vllm_batch_v2: {{ .Values.inferenceFramework.vllm_batch_v2 | default "latest" | quote }}
lightllm: {{ .Values.inferenceFramework.lightllm | default "latest" | quote }}
tensorrt_llm: {{ .Values.inferenceFramework.tensorrt_llm | default "latest" | quote }}
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,13 @@
apiVersion: batch/v1
kind: Job
metadata:
name: {{ include "modelEngine.fullname" . }}-populate-fine-tuning-repository
name: {{ include "modelEngine.fullname" . }}-populate-fine-tuning-repository-{{ .Release.Revision }}
labels:
{{- include "modelEngine.labels" . | nindent 4 }}
annotations:
"helm.sh/hook": post-install
"helm.sh/hook-weight": "1"
"helm.sh/hook-delete-policy": hook-succeeded
"helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded
spec:
backoffLimit: 0
activeDeadlineSeconds: 600
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,16 @@ spec:
- --nodelabels=k8s.amazonaws.com/accelerator=nvidia-ampere-a100
- --logtostderr=true
- --v=2
{{- if not $.Values.automountServiceAccountToken }}
volumeMounts:
{{- include "modelEngine.tokenVolumeMount" $ | nindent 12 }}
{{- end }}
priorityClassName: system-cluster-critical
automountServiceAccountToken: {{ .Values.automountServiceAccountToken }}
serviceAccountName: {{ include "modelEngine.fullname" . }}
{{- if not .Values.automountServiceAccountToken }}
volumes:
{{- include "modelEngine.tokenVolume" . | nindent 8 }}
{{- end }}
{{- end }}
{{- end }}
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,16 @@ spec:
- --nodelabels=k8s.amazonaws.com/accelerator=nvidia-ampere-a10
- --logtostderr=true
- --v=2
{{- if not $.Values.automountServiceAccountToken }}
volumeMounts:
{{- include "modelEngine.tokenVolumeMount" $ | nindent 12 }}
{{- end }}
priorityClassName: system-cluster-critical
automountServiceAccountToken: {{ .Values.automountServiceAccountToken }}
serviceAccountName: {{ include "modelEngine.fullname" . }}
{{- if not .Values.automountServiceAccountToken }}
volumes:
{{- include "modelEngine.tokenVolume" . | nindent 8 }}
{{- end }}
{{- end }}
{{- end }}
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,16 @@ spec:
- --nodelabels=k8s.amazonaws.com/accelerator=nvidia-tesla-t4
- --logtostderr=true
- --v=2
{{- if not $.Values.automountServiceAccountToken }}
volumeMounts:
{{- include "modelEngine.tokenVolumeMount" $ | nindent 12 }}
{{- end }}
priorityClassName: system-cluster-critical
automountServiceAccountToken: {{ .Values.automountServiceAccountToken }}
serviceAccountName: {{ include "modelEngine.fullname" . }}
{{- if not .Values.automountServiceAccountToken }}
volumes:
{{- include "modelEngine.tokenVolume" . | nindent 8 }}
{{- end }}
{{- end }}
{{- end }}
4 changes: 2 additions & 2 deletions charts/model-engine/templates/restart_keda_operator.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,13 @@
apiVersion: batch/v1
kind: Job
metadata:
name: {{ include "modelEngine.fullname" . }}-restart-keda-operator
name: {{ include "modelEngine.fullname" . }}-restart-keda-operator-{{ .Release.Revision }}
labels:
{{- include "modelEngine.labels" . | nindent 4 }}
annotations:
"helm.sh/hook": post-install
"helm.sh/hook-weight": "1"
"helm.sh/hook-delete-policy": hook-succeeded
"helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded
spec:
backoffLimit: 0
activeDeadlineSeconds: 600
Expand Down
3 changes: 3 additions & 0 deletions charts/model-engine/templates/service_account.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,5 +21,8 @@ metadata:
imagePullSecrets:
- name: egp-ecr-regcred
{{- end }}
{{- /*
Render the field for both true and false. A `with` gate treats false as empty
and would omit the field exactly when automount is meant to be disabled.
The field is still omitted when the value is unset (not a boolean).
*/}}
{{- if kindIs "bool" $.Values.automountServiceAccountToken }}
automountServiceAccountToken: {{ $.Values.automountServiceAccountToken }}
{{- end }}
Comment on lines +24 to +26
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

with skips block when value is false

Helm's with treats false as falsy, so when automountServiceAccountToken is set to false (the exact scenario where you want to disable automount on the ServiceAccount), this block will be skipped entirely and automountServiceAccountToken won't be rendered into the YAML. Only when the value is true will it appear — which is the opposite of the critical behavior.

In contrast, the pod-level specs in other templates correctly use {{ .Values.automountServiceAccountToken }} directly without a with gate.

Suggested change
{{- with $.Values.automountServiceAccountToken }}
automountServiceAccountToken: {{ . }}
{{- end }}
automountServiceAccountToken: {{ $.Values.automountServiceAccountToken }}
Prompt To Fix With AI
This is a comment left during a code review.
Path: charts/model-engine/templates/service_account.yaml
Line: 24-26

Comment:
**`with` skips block when value is `false`**

Helm's `with` treats `false` as falsy, so when `automountServiceAccountToken` is set to `false` (the exact scenario where you want to disable automount on the ServiceAccount), this block will be skipped entirely and `automountServiceAccountToken` won't be rendered into the YAML. Only when the value is `true` will it appear — which is the opposite of the critical behavior. 

In contrast, the pod-level specs in other templates correctly use `{{ .Values.automountServiceAccountToken }}` directly without a `with` gate.

```suggestion
automountServiceAccountToken: {{ $.Values.automountServiceAccountToken }}
```

How can I resolve this? If you propose a fix, please make it concise.

---
{{- end }}
4 changes: 4 additions & 0 deletions charts/model-engine/templates/service_account_inference.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,12 @@ metadata:
annotations:
{{- toYaml . | nindent 4 }}
{{- if $.Values.azure }}
{{- /* Prefer the inference-specific client id; fall back to the shared azure client id when it is empty. */}}
azure.workload.identity/client-id: {{ $.Values.azure.inference_client_id | default $.Values.azure.client_id }}
{{- end }}
{{- end }}
{{- if $.Values.azure }}
imagePullSecrets:
Expand Down
Loading