Skip to content

Commit adb1c75

Browse files
committed
Change nvidia device plugin to tegra strategy
1 parent 8b78387 commit adb1c75

File tree

4 files changed

+69
-5
lines changed

4 files changed

+69
-5
lines changed

helm/nvidia-device-plugin/templates/daemonset.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,9 @@ spec:
3737
value: "true"
3838
- name: FAIL_ON_INIT_ERROR
3939
value: "false"
40+
{{- if .Values.env }}
41+
{{- toYaml .Values.env | nindent 8 }}
42+
{{- else }}
4043
- name: DEVICE_LIST_STRATEGY
4144
value: envvar
4245
- name: DEVICE_ID_STRATEGY
@@ -45,6 +48,7 @@ spec:
4548
value: all
4649
- name: NVIDIA_DRIVER_CAPABILITIES
4750
value: "compute,utility"
51+
{{- end }}
4852
volumes:
4953
{{- toYaml .Values.volumes | nindent 8 }}
5054
hostNetwork: true

helm/nvidia-device-plugin/values.yaml

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,26 @@ resources:
2020
securityContext:
2121
privileged: true
2222

23+
# Tolerations for GPU nodes
24+
tolerations:
25+
- key: gpu
26+
operator: Equal
27+
value: "true"
28+
effect: NoExecute
29+
30+
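# Environment variables for nvidia-device-plugin; when this list is set, the DaemonSet template renders it instead of its built-in fallback entries (DEVICE_LIST_STRATEGY, DEVICE_ID_STRATEGY, ...)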
31+
env:
32+
- name: DEVICE_DISCOVERY_STRATEGY
33+
value: "tegra" # Use tegra discovery strategy
34+
- name: DEVICE_LIST_STRATEGY
35+
value: "envvar" # Use envvar for device list strategy
36+
- name: DEVICE_ID_STRATEGY
37+
value: "uuid"
38+
- name: NVIDIA_VISIBLE_DEVICES
39+
value: "all"
40+
- name: NVIDIA_DRIVER_CAPABILITIES
41+
value: "compute,utility"
42+
2343
# Mount the NVIDIA libraries and device files
2444
volumeMounts:
2545
- name: device-plugin

helm/vllm/templates/deployment.yaml

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,9 @@ spec:
1515
labels:
1616
{{- include "vllm.selectorLabels" . | nindent 8 }}
1717
spec:
18+
{{- if .Values.runtimeClassName }}
19+
runtimeClassName: {{ .Values.runtimeClassName }}
20+
{{- end }}
1821
{{- with .Values.nodeSelector }}
1922
nodeSelector:
2023
{{- toYaml . | nindent 8 }}
@@ -46,6 +49,8 @@ spec:
4649
- "{{ .Values.service.targetPort }}"
4750
- "--tensor-parallel-size"
4851
- "{{ .Values.model.tensorParallelSize }}"
52+
- "--device"
53+
- "cuda"
4954
{{- if .Values.persistence.enabled }}
5055
- "--download-dir"
5156
- "{{ .Values.model.downloadDir }}"
@@ -54,11 +59,14 @@ spec:
5459
{{- toYaml .Values.env | nindent 8 }}
5560
resources:
5661
{{- toYaml .Values.resources | nindent 10 }}
57-
{{- if .Values.persistence.enabled }}
5862
volumeMounts:
63+
{{- if .Values.persistence.enabled }}
5964
- name: model-storage
6065
mountPath: {{ .Values.model.downloadDir }}
6166
{{- end }}
67+
{{- if .Values.volumeMounts }}
68+
{{- toYaml .Values.volumeMounts | nindent 8 }}
69+
{{- end }}
6270
readinessProbe:
6371
httpGet:
6472
path: /health
@@ -75,9 +83,12 @@ spec:
7583
periodSeconds: 30
7684
timeoutSeconds: 10
7785
failureThreshold: 3
78-
{{- if .Values.persistence.enabled }}
7986
volumes:
87+
{{- if .Values.persistence.enabled }}
8088
- name: model-storage
8189
persistentVolumeClaim:
8290
claimName: {{ include "vllm.fullname" . }}-models
91+
{{- end }}
92+
{{- if .Values.volumes }}
93+
{{- toYaml .Values.volumes | nindent 6 }}
8394
{{- end }}

helm/vllm/values.yaml

Lines changed: 32 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
image:
22
repository: vllm/vllm-openai
3-
tag: latest
3+
tag: v0.6.3
44
pullPolicy: Always
55

66
replicaCount: 1
@@ -57,11 +57,40 @@ persistence:
5757
# Environment variables
5858
env:
5959
- name: NVIDIA_VISIBLE_DEVICES
60-
value: "all"
60+
value: "0"
61+
- name: CUDA_VISIBLE_DEVICES
62+
value: "0"
6163
- name: NVIDIA_DRIVER_CAPABILITIES
6264
value: "compute,utility"
65+
- name: VLLM_LOGGING_LEVEL
66+
value: "DEBUG"
67+
- name: CUDA_HOME
68+
value: "/usr/local/cuda"
69+
- name: LD_LIBRARY_PATH
70+
value: "/usr/lib/x86_64-linux-gnu:/usr/local/cuda/lib64"
6371

6472
# Security context
6573
securityContext:
6674
runAsNonRoot: false
67-
runAsUser: 0
75+
runAsUser: 0
76+
77+
# Runtime class for NVIDIA GPU support. Left commented out for k3s compatibility; while unset, the deployment template omits runtimeClassName entirely.
78+
# runtimeClassName: nvidia
79+
80+
# Read-only mounts of the NVIDIA library and driver volumes into the vLLM container
81+
volumeMounts:
82+
- name: nvidia-libs
83+
mountPath: /usr/lib/x86_64-linux-gnu
84+
readOnly: true
85+
- name: nvidia-driver
86+
mountPath: /usr/local/nvidia
87+
readOnly: true
88+
89+
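# hostPath volumes exposing the node's NVIDIA library and driver directories (NOTE(review): the nvidia-libs path is x86_64-specific — confirm for Tegra/aarch64 nodes)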
90+
volumes:
91+
- name: nvidia-libs
92+
hostPath:
93+
path: /usr/lib/x86_64-linux-gnu
94+
- name: nvidia-driver
95+
hostPath:
96+
path: /usr/local/nvidia

0 commit comments

Comments
 (0)