From ddbddb753d463b27f0515de23f554e39c585a3ea Mon Sep 17 00:00:00 2001
From: Pradipta Banerjee
Date: Wed, 13 Dec 2023 09:35:53 +0530
Subject: [PATCH] packer: Enable support for adding nvidia gpu to podvm image

NVIDIA GPU support is enabled by default when building the packer-based
podvm image for AWS and Azure.

A default build of podvm in AWS took 10 min:

```
==> Wait completed after 10 minutes 16 seconds

==> Builds finished. The artifacts of successful builds are:
--> peer-pod-ubuntu.amazon-ebs.ubuntu: AMIs were created:
us-east-2: ami-0463ae5aa8d5b3606

rm -fr toupload

real    10m34.352s
user    0m18.919s
sys     0m10.044s
```

If you want to disable it, run the build with ENABLE_NVIDIA_GPU=no.
For example:

cd azure/image
PODVM_DISTRO=ubuntu ENABLE_NVIDIA_GPU=no make image

During the image build, the addons/setup_addons.sh script executes
addons/nvidia_gpu/setup.sh to set up the NVIDIA drivers, libraries and
prestart hook in the podvm image.

Signed-off-by: Pradipta Banerjee
---
 aws/image/Makefile                      |  6 +-
 aws/image/rhel/aws-rhel.pkr.hcl         | 38 +++++++++++++
 aws/image/rhel/variables.pkr.hcl        | 10 ++++
 aws/image/ubuntu/aws-ubuntu.pkr.hcl     | 37 ++++++++++++
 aws/image/ubuntu/variables.pkr.hcl      |  9 +++
 azure/image/Makefile                    |  3 +
 azure/image/rhel/azure-rhel.pkr.hcl     | 37 ++++++++++++
 azure/image/rhel/variables.pkr.hcl      | 10 ++++
 azure/image/ubuntu/azure-ubuntu.pkr.hcl | 38 +++++++++++++
 azure/image/ubuntu/variables.pkr.hcl    | 10 ++++
 podvm/addons/README.md                  |  7 +++
 podvm/addons/nvidia_gpu/README.md       | 76 +++++++++++++++++++++++++
 podvm/addons/nvidia_gpu/setup.sh        | 50 ++++++++++++++++
 podvm/addons/setup_addons.sh            | 12 ++++
 14 files changed, 342 insertions(+), 1 deletion(-)
 create mode 100644 podvm/addons/README.md
 create mode 100644 podvm/addons/nvidia_gpu/README.md
 create mode 100755 podvm/addons/nvidia_gpu/setup.sh
 create mode 100755 podvm/addons/setup_addons.sh

diff --git a/aws/image/Makefile b/aws/image/Makefile
index 0f8374d35..97e40f8ea 100644
--- a/aws/image/Makefile
+++ b/aws/image/Makefile
@@ -7,10 +7,13 @@ include $(ROOT_DIR)podvm/Makefile.inc
 
 .PHONY: image clean default-vpc
 
-INSTANCE_TYPE ?= t3.small
+INSTANCE_TYPE ?= c4.xlarge
 VOLUME_SIZE ?= 30
 
+ENABLE_NVIDIA_GPU ?= yes
+export ENABLE_NVIDIA_GPU
+
 image: $(IMAGE_FILE)
 
 $(IMAGE_FILE): setopts $(BINARIES) $(FILES)
@@ -21,6 +24,7 @@ $(IMAGE_FILE): setopts $(BINARIES) $(FILES)
 		-var instance_type=${INSTANCE_TYPE} \
 		-var volume_size=${VOLUME_SIZE} \
 		-var config_script_src=$(ROOT_DIR)/podvm/qcow2 \
+		-var addons_script_src=$(ROOT_DIR)/podvm/addons \
 		-var ami_name=${IMAGE_NAME} ./${PODVM_DISTRO}/
 	rm -fr toupload
diff --git a/aws/image/rhel/aws-rhel.pkr.hcl b/aws/image/rhel/aws-rhel.pkr.hcl
index 335c22dab..319252022 100644
--- a/aws/image/rhel/aws-rhel.pkr.hcl
+++ b/aws/image/rhel/aws-rhel.pkr.hcl
@@ -97,4 +97,42 @@ build {
       "sudo -E bash ~/misc-settings.sh"
     ]
   }
+
+  # Addons
+  # To avoid multiple conditionals, copying the entire addons directory
+  # Individual addons are installed based on environment_vars by setup_addons.sh
+  provisioner "shell-local" {
+    command = "tar cf toupload/addons.tar -C ../../podvm addons"
+  }
+
+  provisioner "file" {
+    source = "toupload"
+    destination = "/tmp/"
+  }
+
+  provisioner "shell" {
+    inline = [
+      "cd /tmp && tar xf toupload/addons.tar",
+      "rm toupload/addons.tar"
+    ]
+  }
+
+  provisioner "file" {
+    source = "${var.addons_script_src}/setup_addons.sh"
+    destination = "~/setup_addons.sh"
+  }
+
+  provisioner "shell" {
+    remote_folder = "~"
+    environment_vars = [
+      "CLOUD_PROVIDER=${var.cloud_provider}",
+      "PODVM_DISTRO=${var.podvm_distro}",
"DISABLE_CLOUD_CONFIG=${var.disable_cloud_config}", + "ENABLE_NVIDIA_GPU=${var.enable_nvidia_gpu}" + ] + inline = [ + "sudo -E bash ~/setup_addons.sh" + ] + } + } diff --git a/aws/image/rhel/variables.pkr.hcl b/aws/image/rhel/variables.pkr.hcl index 202f569c8..d4e52bb4d 100644 --- a/aws/image/rhel/variables.pkr.hcl +++ b/aws/image/rhel/variables.pkr.hcl @@ -52,3 +52,13 @@ variable "config_script_src" { type = string default = "" } + +variable "addons_script_src" { + type = string + default = "" +} + +variable "enable_nvidia_gpu" { + type = string + default = env("ENABLE_NVIDIA_GPU") +} diff --git a/aws/image/ubuntu/aws-ubuntu.pkr.hcl b/aws/image/ubuntu/aws-ubuntu.pkr.hcl index 57821f378..883588320 100644 --- a/aws/image/ubuntu/aws-ubuntu.pkr.hcl +++ b/aws/image/ubuntu/aws-ubuntu.pkr.hcl @@ -83,4 +83,41 @@ build { "sudo -E bash ~/misc-settings.sh" ] } + + # Addons + # To avoid multiple conditionals, copying the entire addons directory + # Individual addons are installed based on environment_vars by setup_addons.sh + provisioner "shell-local" { + command = "tar cf toupload/addons.tar -C ../../podvm addons" + } + + provisioner "file" { + source = "toupload" + destination = "/tmp/" + } + + provisioner "shell" { + inline = [ + "cd /tmp && tar xf toupload/addons.tar", + "rm toupload/addons.tar" + ] + } + + provisioner "file" { + source = "${var.addons_script_src}/setup_addons.sh" + destination = "~/setup_addons.sh" + } + + provisioner "shell" { + remote_folder = "~" + environment_vars = [ + "CLOUD_PROVIDER=${var.cloud_provider}", + "PODVM_DISTRO=${var.podvm_distro}", + "DISABLE_CLOUD_CONFIG=${var.disable_cloud_config}", + "ENABLE_NVIDIA_GPU=${var.enable_nvidia_gpu}" + ] + inline = [ + "sudo -E bash ~/setup_addons.sh" + ] + } } diff --git a/aws/image/ubuntu/variables.pkr.hcl b/aws/image/ubuntu/variables.pkr.hcl index 493109935..d4e52bb4d 100644 --- a/aws/image/ubuntu/variables.pkr.hcl +++ b/aws/image/ubuntu/variables.pkr.hcl @@ -53,3 +53,12 @@ variable "config_script_src" { default = "" } +variable "addons_script_src" { + type = string + default = "" +} + +variable "enable_nvidia_gpu" { + type = string + default = env("ENABLE_NVIDIA_GPU") +} diff --git a/azure/image/Makefile b/azure/image/Makefile index e9d3e8309..2c9da5c61 100644 --- a/azure/image/Makefile +++ b/azure/image/Makefile @@ -7,6 +7,8 @@ include $(ROOT_DIR)podvm/Makefile.inc .PHONY: image clean +ENABLE_NVIDIA_GPU ?= yes + image: $(IMAGE_FILE) $(IMAGE_FILE): $(BINARIES) $(FILES) @@ -27,6 +29,7 @@ $(IMAGE_FILE): $(BINARIES) $(FILES) packer init ./${PODVM_DISTRO}/ packer build \ -var config_script_src=$(ROOT_DIR)/podvm/qcow2 \ + -var addons_script_src=$(ROOT_DIR)/podvm/addons \ ./${PODVM_DISTRO}/ rm -fr toupload diff --git a/azure/image/rhel/azure-rhel.pkr.hcl b/azure/image/rhel/azure-rhel.pkr.hcl index bffc09e21..b1f3168d3 100644 --- a/azure/image/rhel/azure-rhel.pkr.hcl +++ b/azure/image/rhel/azure-rhel.pkr.hcl @@ -110,6 +110,43 @@ build { ] } + # Addons + # To avoid multiple conditionals, copying the entire addons directory + # Individual addons are installed based on environment_vars by setup_addons.sh + provisioner "shell-local" { + command = "tar cf toupload/addons.tar -C ../../podvm addons" + } + + provisioner "file" { + source = "toupload" + destination = "/tmp/" + } + + provisioner "shell" { + inline = [ + "cd /tmp && tar xf toupload/addons.tar", + "rm toupload/addons.tar" + ] + } + + provisioner "file" { + source = "${var.addons_script_src}/setup_addons.sh" + destination = "~/setup_addons.sh" + } + + provisioner "shell" { + 
+    remote_folder = "~"
+    environment_vars = [
+      "CLOUD_PROVIDER=${var.cloud_provider}",
+      "PODVM_DISTRO=${var.podvm_distro}",
+      "DISABLE_CLOUD_CONFIG=${var.disable_cloud_config}",
+      "ENABLE_NVIDIA_GPU=${var.enable_nvidia_gpu}"
+    ]
+    inline = [
+      "sudo -E bash ~/setup_addons.sh"
+    ]
+  }
+
   provisioner "shell" {
     execute_command = "chmod +x {{ .Path }}; {{ .Vars }} sudo -E sh '{{ .Path }}'"
     inline = [
diff --git a/azure/image/rhel/variables.pkr.hcl b/azure/image/rhel/variables.pkr.hcl
index efeb60680..8831e4ef0 100644
--- a/azure/image/rhel/variables.pkr.hcl
+++ b/azure/image/rhel/variables.pkr.hcl
@@ -123,3 +123,13 @@ variable "config_script_src" {
   type = string
   default = ""
 }
+
+variable "addons_script_src" {
+  type = string
+  default = ""
+}
+
+variable "enable_nvidia_gpu" {
+  type = string
+  default = env("ENABLE_NVIDIA_GPU")
+}
diff --git a/azure/image/ubuntu/azure-ubuntu.pkr.hcl b/azure/image/ubuntu/azure-ubuntu.pkr.hcl
index 2172a0559..948a07827 100644
--- a/azure/image/ubuntu/azure-ubuntu.pkr.hcl
+++ b/azure/image/ubuntu/azure-ubuntu.pkr.hcl
@@ -90,6 +90,44 @@ build {
     ]
   }
 
+  # Addons
+  # To avoid multiple conditionals, copying the entire addons directory
+  # Individual addons are installed based on environment_vars by setup_addons.sh
+  provisioner "shell-local" {
+    command = "tar cf toupload/addons.tar -C ../../podvm addons"
+  }
+
+  provisioner "file" {
+    source = "toupload"
+    destination = "/tmp/"
+  }
+
+  provisioner "shell" {
+    inline = [
+      "cd /tmp && tar xf toupload/addons.tar",
+      "rm toupload/addons.tar"
+    ]
+  }
+
+  provisioner "file" {
+    source = "${var.addons_script_src}/setup_addons.sh"
+    destination = "~/setup_addons.sh"
+  }
+
+  provisioner "shell" {
+    remote_folder = "~"
+    environment_vars = [
+      "CLOUD_PROVIDER=${var.cloud_provider}",
+      "PODVM_DISTRO=${var.podvm_distro}",
+      "DISABLE_CLOUD_CONFIG=${var.disable_cloud_config}",
+      "ENABLE_NVIDIA_GPU=${var.enable_nvidia_gpu}"
+    ]
+    inline = [
+      "sudo -E bash ~/setup_addons.sh"
+    ]
+  }
+
   provisioner "shell" {
     execute_command = "chmod +x {{ .Path }}; {{ .Vars }} sudo -E sh '{{ .Path }}'"
     inline = [
diff --git a/azure/image/ubuntu/variables.pkr.hcl b/azure/image/ubuntu/variables.pkr.hcl
index 677470490..066516fcf 100644
--- a/azure/image/ubuntu/variables.pkr.hcl
+++ b/azure/image/ubuntu/variables.pkr.hcl
@@ -123,3 +123,13 @@ variable "config_script_src" {
   type = string
   default = ""
 }
+
+variable "addons_script_src" {
+  type = string
+  default = ""
+}
+
+variable "enable_nvidia_gpu" {
+  type = string
+  default = env("ENABLE_NVIDIA_GPU")
+}
diff --git a/podvm/addons/README.md b/podvm/addons/README.md
new file mode 100644
index 000000000..933e2f2c6
--- /dev/null
+++ b/podvm/addons/README.md
@@ -0,0 +1,7 @@
+## Introduction
+
+The addons directory is used to enable different addons for the podvm image.
+Each addon and its associated files (binaries, configuration, etc.) should be
+placed in a specific sub-directory under `addons`.
+
+Each addon sub-directory needs a `setup.sh` script that sets up the addon.
diff --git a/podvm/addons/nvidia_gpu/README.md b/podvm/addons/nvidia_gpu/README.md
new file mode 100644
index 000000000..b113b0312
--- /dev/null
+++ b/podvm/addons/nvidia_gpu/README.md
@@ -0,0 +1,76 @@
+## Introduction
+
+This addon enables NVIDIA GPU support in the podvm image.
+
+You need to specify the GPU instance types in the cloud-api-adaptor configMap (peer-pods-cm).
+
+Here is an example. Replace the values as appropriate for your provider and region.
+
+```
+# For AWS
+PODVM_INSTANCE_TYPES: "t3.small,c5.xlarge,p3.2xlarge"
+
+# For Azure
+AZURE_INSTANCE_SIZES: "Standard_D8as_v5,Standard_D4as_v5,Standard_NC6s_v3,Standard_NC4as_T4_v3"
+
+```
+
+Example pod definition:
+```
+apiVersion: v1
+kind: Pod
+metadata:
+  name: gpu-test
+  labels:
+    app: test
+  annotations:
+    io.katacontainers.config.hypervisor.machine_type: Standard_NC4as_T4_v3
+    io.containerd.cri.runtime-handler: kata-remote
+spec:
+  runtimeClassName: kata-remote
+  containers:
+  - name: ubuntu
+    image: ubuntu
+    command: ["sleep"]
+    args: ["infinity"]
+    env:
+    - name: NVIDIA_VISIBLE_DEVICES
+      value: "all"
+```
+
+You can verify the GPU devices by opening a shell in the pod as shown below:
+
+```
+$ kubectl exec -it gpu-test -- bash
+root@gpu-test:/# nvidia-smi
+Thu Nov 23 17:30:58 2023
++---------------------------------------------------------------------------------------+
+| NVIDIA-SMI 535.129.03             Driver Version: 535.129.03    CUDA Version: 12.2    |
+|-----------------------------------------+----------------------+----------------------+
+| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
+| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
+|                                         |                      |               MIG M. |
+|=========================================+======================+======================|
+|   0  Tesla T4                       Off | 00000001:00:00.0 Off |                  Off |
+| N/A   36C    P8               9W /  70W |      2MiB / 16384MiB |      0%      Default |
+|                                         |                      |                  N/A |
++-----------------------------------------+----------------------+----------------------+
+
++---------------------------------------------------------------------------------------+
+| Processes:                                                                             |
+|  GPU   GI   CI        PID   Type   Process name                            GPU Memory |
+|        ID   ID                                                             Usage      |
+|=======================================================================================|
+|  No running processes found                                                           |
++---------------------------------------------------------------------------------------+
+
+root@gpu-test:/# nvidia-smi -L
+GPU 0: Tesla T4 (UUID: GPU-2b9a9945-a56c-fcf3-7156-8e380cf1d0cc)
+
+root@gpu-test:/# ls -l /dev/nvidia*
+crw-rw-rw- 1 root root 235,   0 Nov 23 17:27 /dev/nvidia-uvm
+crw-rw-rw- 1 root root 235,   1 Nov 23 17:27 /dev/nvidia-uvm-tools
+crw-rw-rw- 1 root root 195,   0 Nov 23 17:27 /dev/nvidia0
+crw-rw-rw- 1 root root 195, 255 Nov 23 17:27 /dev/nvidiactl
+
+```
diff --git a/podvm/addons/nvidia_gpu/setup.sh b/podvm/addons/nvidia_gpu/setup.sh
new file mode 100755
index 000000000..29797c48e
--- /dev/null
+++ b/podvm/addons/nvidia_gpu/setup.sh
@@ -0,0 +1,50 @@
+#!/bin/bash
+
+# Create the prestart hook directory
+mkdir -p /usr/share/oci/hooks/prestart
+
+# Add hook script
+cat <<'END' > /usr/share/oci/hooks/prestart/nvidia-container-toolkit.sh
+#!/bin/bash -x
+
+/usr/bin/nvidia-container-toolkit -debug "$@"
+END
+
+# Make the script executable
+chmod +x /usr/share/oci/hooks/prestart/nvidia-container-toolkit.sh
+
+# The PODVM_DISTRO variable is set as part of the podvm image build process
+# and is available inside the packer VM
+# Add NVIDIA packages
+if [[ "$PODVM_DISTRO" == "ubuntu" ]]; then
+  export DEBIAN_FRONTEND=noninteractive
+  distribution=$(. /etc/os-release;echo $ID$VERSION_ID)
+  curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg
+  curl -s -L https://nvidia.github.io/libnvidia-container/$distribution/libnvidia-container.list | sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
+  apt-get -q update -y
+  apt-get -q install -y nvidia-container-toolkit
+  apt-get -q install -y wget build-essential pkg-config
+  apt-get -q install -y nvidia-driver-530
+
+  sed -i "s/#debug/debug/g" /etc/nvidia-container-runtime/config.toml
+  sed -i "s|/var/log|/var/log/nvidia-kata-container|g" /etc/nvidia-container-runtime/config.toml
+  sed -i "s/#no-cgroups = false/no-cgroups = true/g" /etc/nvidia-container-runtime/config.toml
+  sed -i "/\[nvidia-container-cli\]/a no-pivot = true" /etc/nvidia-container-runtime/config.toml
+  sed -i "s/disable-require = false/disable-require = true/g" /etc/nvidia-container-runtime/config.toml
+
+  apt remove -q -y build-essential
+fi
+if [[ "$PODVM_DISTRO" == "rhel" ]]; then
+  dnf config-manager --add-repo http://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo
+  dnf install -q -y kernel-devel-"$(uname -r)" kernel-headers-"$(uname -r)"
+
+  dnf install -q -y nvidia-container-toolkit
+  dnf -q -y module install nvidia-driver:latest
+
+  sed -i "s/#debug/debug/g" /etc/nvidia-container-runtime/config.toml
+  sed -i "s|/var/log|/var/log/nvidia-kata-container|g" /etc/nvidia-container-runtime/config.toml
+  sed -i "s/#no-cgroups = false/no-cgroups = true/g" /etc/nvidia-container-runtime/config.toml
+  sed -i "/\[nvidia-container-cli\]/a no-pivot = true" /etc/nvidia-container-runtime/config.toml
+  sed -i "s/disable-require = false/disable-require = true/g" /etc/nvidia-container-runtime/config.toml
+
+fi
diff --git a/podvm/addons/setup_addons.sh b/podvm/addons/setup_addons.sh
new file mode 100755
index 000000000..74191df7f
--- /dev/null
+++ b/podvm/addons/setup_addons.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+
+set -euo pipefail
+
+# This is the dir in the pod vm image during build
+ADDONS_DIR="/tmp/addons"
+
+# Check environment variables and execute corresponding scripts
+if [[ "${ENABLE_NVIDIA_GPU}" == "yes" ]]; then
+  echo "Setting up Nvidia GPU"
+  ${ADDONS_DIR}/nvidia_gpu/setup.sh
+fi