packer: Enable support for adding nvidia gpu to podvm image
NVIDIA GPU support is enabled by default when building the Packer-based
podvm image for AWS and Azure.
A default podvm build on AWS took about 10 minutes:

    ```
    ==> Wait completed after 10 minutes 16 seconds

    ==> Builds finished. The artifacts of successful builds are:
    --> peer-pod-ubuntu.amazon-ebs.ubuntu: AMIs were created:
    us-east-2: ami-0463ae5aa8d5b3606

    rm -fr toupload

    real    10m34.352s
    user    0m18.919s
    sys     0m10.044s
    ```

If you want to disable GPU support, run the build with ENABLE_NVIDIA_GPU=no.
For example:

    ```
    cd azure/image
    PODVM_DISTRO=ubuntu ENABLE_NVIDIA_GPU=no make image
    ```

When GPU support is enabled, the qcow2/setup_addons.sh script executes
addons/nvidia_gpu/setup.sh to set up the NVIDIA drivers, libraries, and
prestart hook in the podvm image.
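For illustration only, a dispatcher of that form might look roughly like the
sketch below. This is not the actual setup_addons.sh from this commit; the
addons path and flag handling here are assumptions.

```bash
#!/bin/bash
# Hypothetical sketch of an addons dispatcher: run each addon's setup.sh
# only when its corresponding ENABLE_* flag is set to "yes".
set -euo pipefail

# Assumed location of the addons tree after the provisioner extracts addons.tar
ADDONS_DIR="${ADDONS_DIR:-/tmp/addons}"

if [ "${ENABLE_NVIDIA_GPU:-no}" = "yes" ]; then
    echo "Setting up the NVIDIA GPU addon"
    bash "${ADDONS_DIR}/nvidia_gpu/setup.sh"
fi
```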

Signed-off-by: Pradipta Banerjee <pradipta.banerjee@gmail.com>
bpradipt committed Dec 13, 2023
1 parent 81ac36d commit ddbddb7
Showing 14 changed files with 342 additions and 1 deletion.
6 changes: 5 additions & 1 deletion aws/image/Makefile
@@ -7,10 +7,13 @@ include $(ROOT_DIR)podvm/Makefile.inc

.PHONY: image clean default-vpc

-INSTANCE_TYPE ?= t3.small
+INSTANCE_TYPE ?= c4.xlarge

VOLUME_SIZE ?= 30

ENABLE_NVIDIA_GPU ?= yes
export ENABLE_NVIDIA_GPU

image: $(IMAGE_FILE)

$(IMAGE_FILE): setopts $(BINARIES) $(FILES)
@@ -21,6 +24,7 @@ $(IMAGE_FILE): setopts $(BINARIES) $(FILES)
-var instance_type=${INSTANCE_TYPE} \
-var volume_size=${VOLUME_SIZE} \
-var config_script_src=$(ROOT_DIR)/podvm/qcow2 \
-var addons_script_src=$(ROOT_DIR)/podvm/addons \
-var ami_name=${IMAGE_NAME} ./${PODVM_DISTRO}/
rm -fr toupload

38 changes: 38 additions & 0 deletions aws/image/rhel/aws-rhel.pkr.hcl
@@ -97,4 +97,42 @@ build {
"sudo -E bash ~/misc-settings.sh"
]
}

# Addons
# To avoid multiple conditionals, copying the entire addons directory
# Individual addons are installed based on environment_vars by setup_addons.sh
provisioner "shell-local" {
command = "tar cf toupload/addons.tar -C ../../podvm addons"
}

provisioner "file" {
source = "toupload"
destination = "/tmp/"
}

provisioner "shell" {
inline = [
"cd /tmp && tar xf toupload/addons.tar",
"rm toupload/addons.tar"
]
}

provisioner "file" {
source = "${var.addons_script_src}/setup_addons.sh"
destination = "~/setup_addons.sh"
}

provisioner "shell" {
remote_folder = "~"
environment_vars = [
"CLOUD_PROVIDER=${var.cloud_provider}",
"PODVM_DISTRO=${var.podvm_distro}",
"DISABLE_CLOUD_CONFIG=${var.disable_cloud_config}",
"ENABLE_NVIDIA_GPU=${var.enable_nvidia_gpu}"
]
inline = [
"sudo -E bash ~/setup_addons.sh"
]
}

}
10 changes: 10 additions & 0 deletions aws/image/rhel/variables.pkr.hcl
@@ -52,3 +52,13 @@ variable "config_script_src" {
type = string
default = ""
}

variable "addons_script_src" {
type = string
default = ""
}

variable "enable_nvidia_gpu" {
type = string
default = env("ENABLE_NVIDIA_GPU")
}
37 changes: 37 additions & 0 deletions aws/image/ubuntu/aws-ubuntu.pkr.hcl
@@ -83,4 +83,41 @@ build {
"sudo -E bash ~/misc-settings.sh"
]
}

# Addons
# To avoid multiple conditionals, copying the entire addons directory
# Individual addons are installed based on environment_vars by setup_addons.sh
provisioner "shell-local" {
command = "tar cf toupload/addons.tar -C ../../podvm addons"
}

provisioner "file" {
source = "toupload"
destination = "/tmp/"
}

provisioner "shell" {
inline = [
"cd /tmp && tar xf toupload/addons.tar",
"rm toupload/addons.tar"
]
}

provisioner "file" {
source = "${var.addons_script_src}/setup_addons.sh"
destination = "~/setup_addons.sh"
}

provisioner "shell" {
remote_folder = "~"
environment_vars = [
"CLOUD_PROVIDER=${var.cloud_provider}",
"PODVM_DISTRO=${var.podvm_distro}",
"DISABLE_CLOUD_CONFIG=${var.disable_cloud_config}",
"ENABLE_NVIDIA_GPU=${var.enable_nvidia_gpu}"
]
inline = [
"sudo -E bash ~/setup_addons.sh"
]
}
}
9 changes: 9 additions & 0 deletions aws/image/ubuntu/variables.pkr.hcl
@@ -53,3 +53,12 @@ variable "config_script_src" {
default = ""
}

variable "addons_script_src" {
type = string
default = ""
}

variable "enable_nvidia_gpu" {
type = string
default = env("ENABLE_NVIDIA_GPU")
}
3 changes: 3 additions & 0 deletions azure/image/Makefile
@@ -7,6 +7,8 @@ include $(ROOT_DIR)podvm/Makefile.inc

.PHONY: image clean

ENABLE_NVIDIA_GPU ?= yes

image: $(IMAGE_FILE)

$(IMAGE_FILE): $(BINARIES) $(FILES)
@@ -27,6 +29,7 @@ $(IMAGE_FILE): $(BINARIES) $(FILES)
packer init ./${PODVM_DISTRO}/
packer build \
-var config_script_src=$(ROOT_DIR)/podvm/qcow2 \
-var addons_script_src=$(ROOT_DIR)/podvm/addons \
./${PODVM_DISTRO}/
rm -fr toupload

37 changes: 37 additions & 0 deletions azure/image/rhel/azure-rhel.pkr.hcl
@@ -110,6 +110,43 @@ build {
]
}

# Addons
# To avoid multiple conditionals, copying the entire addons directory
# Individual addons are installed based on environment_vars by setup_addons.sh
provisioner "shell-local" {
command = "tar cf toupload/addons.tar -C ../../podvm addons"
}

provisioner "file" {
source = "toupload"
destination = "/tmp/"
}

provisioner "shell" {
inline = [
"cd /tmp && tar xf toupload/addons.tar",
"rm toupload/addons.tar"
]
}

provisioner "file" {
source = "${var.addons_script_src}/setup_addons.sh"
destination = "~/setup_addons.sh"
}

provisioner "shell" {
remote_folder = "~"
environment_vars = [
"CLOUD_PROVIDER=${var.cloud_provider}",
"PODVM_DISTRO=${var.podvm_distro}",
"DISABLE_CLOUD_CONFIG=${var.disable_cloud_config}",
"ENABLE_NVIDIA_GPU=${var.enable_nvidia_gpu}"
]
inline = [
"sudo -E bash ~/setup_addons.sh"
]
}

provisioner "shell" {
execute_command = "chmod +x {{ .Path }}; {{ .Vars }} sudo -E sh '{{ .Path }}'"
inline = [
10 changes: 10 additions & 0 deletions azure/image/rhel/variables.pkr.hcl
@@ -123,3 +123,13 @@ variable "config_script_src" {
type = string
default = ""
}

variable "addons_script_src" {
type = string
default = ""
}

variable "enable_nvidia_gpu" {
type = string
default = env("ENABLE_NVIDIA_GPU")
}
38 changes: 38 additions & 0 deletions azure/image/ubuntu/azure-ubuntu.pkr.hcl
@@ -90,6 +90,44 @@ build {
]
}

# Addons
# To avoid multiple conditionals, copying the entire addons directory
# Individual addons are installed based on environment_vars by setup_addons.sh
provisioner "shell-local" {
command = "tar cf toupload/addons.tar -C ../../podvm addons"
}

provisioner "file" {
source = "toupload"
destination = "/tmp/"
}

provisioner "shell" {
inline = [
"cd /tmp && tar xf toupload/addons.tar",
"rm toupload/addons.tar"
]
}

provisioner "file" {
source = "${var.addons_script_src}/setup_addons.sh"
destination = "~/setup_addons.sh"
}

provisioner "shell" {
remote_folder = "~"
environment_vars = [
"CLOUD_PROVIDER=${var.cloud_provider}",
"PODVM_DISTRO=${var.podvm_distro}",
"DISABLE_CLOUD_CONFIG=${var.disable_cloud_config}",
"ENABLE_NVIDIA_GPU=${var.enable_nvidia_gpu}"
]
inline = [
"sudo -E bash ~/setup_addons.sh"
]
}


provisioner "shell" {
execute_command = "chmod +x {{ .Path }}; {{ .Vars }} sudo -E sh '{{ .Path }}'"
inline = [
10 changes: 10 additions & 0 deletions azure/image/ubuntu/variables.pkr.hcl
@@ -123,3 +123,13 @@ variable "config_script_src" {
type = string
default = ""
}

variable "addons_script_src" {
type = string
default = ""
}

variable "enable_nvidia_gpu" {
type = string
default = env("ENABLE_NVIDIA_GPU")
}
7 changes: 7 additions & 0 deletions podvm/addons/README.md
@@ -0,0 +1,7 @@
## Introduction

The `addons` directory is used to enable different addons for the podvm image.
Each addon and its associated files (binaries, configuration, etc.) should live in
a dedicated sub-directory under `addons`.

Each addon sub-directory must provide a `setup.sh` script for setting up the addon.
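As a rough illustration (the addon name, file names, and paths below are invented
for this sketch and are not part of this commit), a new addon could look like:

```bash
#!/bin/bash
# Hypothetical podvm/addons/my_addon/setup.sh for an addon named "my_addon".
# Expected layout for this sketch:
#   podvm/addons/my_addon/setup.sh       <- entry point run by setup_addons.sh
#   podvm/addons/my_addon/my_addon.conf  <- any files the addon needs
set -euo pipefail

echo "Installing my_addon for distro: ${PODVM_DISTRO:-unknown}"

# Install packages, copy configuration, enable services, etc.
install -m 0644 "$(dirname "$0")/my_addon.conf" /etc/my_addon.conf
```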
76 changes: 76 additions & 0 deletions podvm/addons/nvidia_gpu/README.md
@@ -0,0 +1,76 @@
## Introduction

This addon enables NVIDIA GPU support in the podvm image.

You need to specify the GPU instance types in the cloud-api-adaptor configMap (peer-pods-cm).

Here is an example. Replace the values as appropriate for your specific provider and region:

```
# For AWS
PODVM_INSTANCE_TYPES: "t3.small,c5.xlarge,p3.2xlarge"
# For Azure
AZURE_INSTANCE_SIZES: "Standard_D8as_v5,Standard_D4as_v5,Standard_NC6s_v3,Standard_NC4as_T4_v3"
```
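One way to apply this is to patch the ConfigMap and restart the cloud-api-adaptor
pods. This is only a sketch: the namespace and daemonset name below assume a
default install and may differ in your cluster.

```bash
# Assumed namespace and daemonset name for a default cloud-api-adaptor deployment.
kubectl -n confidential-containers-system patch configmap peer-pods-cm \
  --type merge -p '{"data":{"PODVM_INSTANCE_TYPES":"t3.small,c5.xlarge,p3.2xlarge"}}'

# Restart cloud-api-adaptor so it picks up the new instance types.
kubectl -n confidential-containers-system rollout restart daemonset cloud-api-adaptor-daemonset
```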

Example pod definition:
```
apiVersion: v1
kind: Pod
metadata:
name: gpu-test
labels:
app: test
annotations:
io.katacontainers.config.hypervisor.machine_type: Standard_NC4as_T4_v3
io.containerd.cri.runtime-handler: kata-remote
spec:
runtimeClassName: kata-remote
containers:
- name: ubuntu
image: ubuntu
command: ["sleep"]
args: ["infinity"]
env:
- name: NVIDIA_VISIBLE_DEVICES
value: "all"
```

You can verify the GPU devices by opening a shell in the pod (`kubectl exec`) as shown below:

```
$ kubectl exec -it gpu-test -- bash
root@gpu-test:/# nvidia-smi
Thu Nov 23 17:30:58 2023
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.129.03 Driver Version: 535.129.03 CUDA Version: 12.2 |
|-----------------------------------------+----------------------+----------------------+
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+======================+======================|
| 0 Tesla T4 Off | 00000001:00:00.0 Off | Off |
| N/A 36C P8 9W / 70W | 2MiB / 16384MiB | 0% Default |
| | | N/A |
+-----------------------------------------+----------------------+----------------------+
+---------------------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=======================================================================================|
| No running processes found |
+---------------------------------------------------------------------------------------+
root@gpu-test:/# nvidia-smi -L
GPU 0: Tesla T4 (UUID: GPU-2b9a9945-a56c-fcf3-7156-8e380cf1d0cc)
root@gpu-test:/# ls -l /dev/nvidia*
crw-rw-rw- 1 root root 235, 0 Nov 23 17:27 /dev/nvidia-uvm
crw-rw-rw- 1 root root 235, 1 Nov 23 17:27 /dev/nvidia-uvm-tools
crw-rw-rw- 1 root root 195, 0 Nov 23 17:27 /dev/nvidia0
crw-rw-rw- 1 root root 195, 255 Nov 23 17:27 /dev/nvidiactl
```