From c2e8b953b27ceea09be4c787f79cd4bc484ef8bb Mon Sep 17 00:00:00 2001
From: Vara Bonthu
Date: Sun, 18 Feb 2024 17:19:44 -0800
Subject: [PATCH] Nvidia device plugin addon added

---
 README.md               |  3 ++
 nvidia-device-plugin.tf | 91 +++++++++++++++++++++++++++++++++++++++++++++
 variables.tf            | 15 ++++++++
 3 files changed, 109 insertions(+)
 create mode 100644 nvidia-device-plugin.tf

diff --git a/README.md b/README.md
index e71ea94..a1eee9e 100644
--- a/README.md
+++ b/README.md
@@ -101,6 +101,7 @@ module "eks_data_addons" {
 | [helm_release.kubecost](https://registry.terraform.io/providers/hashicorp/helm/latest/docs/resources/release) | resource |
 | [helm_release.kuberay_operator](https://registry.terraform.io/providers/hashicorp/helm/latest/docs/resources/release) | resource |
 | [helm_release.mlflow_tracking](https://registry.terraform.io/providers/hashicorp/helm/latest/docs/resources/release) | resource |
+| [helm_release.nvidia_device_plugin](https://registry.terraform.io/providers/hashicorp/helm/latest/docs/resources/release) | resource |
 | [helm_release.nvidia_gpu_operator](https://registry.terraform.io/providers/hashicorp/helm/latest/docs/resources/release) | resource |
 | [helm_release.pinot](https://registry.terraform.io/providers/hashicorp/helm/latest/docs/resources/release) | resource |
 | [helm_release.spark_history_server](https://registry.terraform.io/providers/hashicorp/helm/latest/docs/resources/release) | resource |
@@ -137,6 +138,7 @@ module "eks_data_addons" {
 | [enable\_kubecost](#input\_enable\_kubecost) | Enable Kubecost add-on | `bool` | `false` | no |
 | [enable\_kuberay\_operator](#input\_enable\_kuberay\_operator) | Enable Kuberay Operator add-on | `bool` | `false` | no |
 | [enable\_mlflow\_tracking](#input\_enable\_mlflow\_tracking) | Enable MLflow Tracking add-on | `bool` | `false` | no |
+| [enable\_nvidia\_device\_plugin](#input\_enable\_nvidia\_device\_plugin) | Enable NVIDIA Device Plugin add-on | `bool` | `false` | no |
 | [enable\_nvidia\_gpu\_operator](#input\_enable\_nvidia\_gpu\_operator) | Enable NVIDIA GPU Operator add-on | `bool` | `false` | no |
 | [enable\_pinot](#input\_enable\_pinot) | Enable Apache Pinot Add-On | `bool` | `false` | no |
 | [enable\_spark\_history\_server](#input\_enable\_spark\_history\_server) | Enable Spark History Server add-on | `bool` | `false` | no |
@@ -150,6 +152,7 @@ module "eks_data_addons" {
 | [kubecost\_helm\_config](#input\_kubecost\_helm\_config) | Kubecost Helm Chart config | `any` | `{}` | no |
 | [kuberay\_operator\_helm\_config](#input\_kuberay\_operator\_helm\_config) | Helm configuration for Kuberay Operator | `any` | `{}` | no |
 | [mlflow\_tracking\_helm\_config](#input\_mlflow\_tracking\_helm\_config) | MLflow Tracking add-on Helm Chart config | `any` | `{}` | no |
+| [nvidia\_device\_plugin\_helm\_config](#input\_nvidia\_device\_plugin\_helm\_config) | NVIDIA Device Plugin Helm Chart config | `any` | `{}` | no |
 | [nvidia\_gpu\_operator\_helm\_config](#input\_nvidia\_gpu\_operator\_helm\_config) | Helm configuration for NVIDIA GPU Operator | `any` | `{}` | no |
 | [oidc\_provider\_arn](#input\_oidc\_provider\_arn) | The ARN of the cluster OIDC Provider | `string` | n/a | yes |
 | [pinot\_helm\_config](#input\_pinot\_helm\_config) | Apache Pinot Helm Chart config | `any` | `{}` | no |
diff --git a/nvidia-device-plugin.tf b/nvidia-device-plugin.tf
new file mode 100644
index 0000000..23f84a7
--- /dev/null
+++ b/nvidia-device-plugin.tf
@@ -0,0 +1,91 @@
+# Default chart values for the NVIDIA device plugin, merged with caller overrides.
+locals {
+  # Defaults: enable the chart's gfd and nfd components; the nfd worker gets a
+  # toleration for the nvidia.com/gpu NoSchedule taint plus a key-less "Exists" one.
+  nvidia_device_plugin_default_values = <<-EOT
+gfd:
+  enabled: true
+nfd:
+  enabled: true
+  worker:
+    tolerations:
+      - key: nvidia.com/gpu
+        operator: Exists
+        effect: NoSchedule
+      - operator: "Exists"
+EOT
+
+  # Only values[0] of the helm config is consulted; if it is missing or is not
+  # valid YAML the defaults are used unchanged. merge() is shallow, so overriding
+  # a top-level key (e.g. nfd) replaces that whole default subtree.
+  nvidia_device_plugin_merged_values_yaml = yamlencode(merge(
+    yamldecode(local.nvidia_device_plugin_default_values),
+    try(yamldecode(var.nvidia_device_plugin_helm_config.values[0]), {})
+  ))
+}
+
+# Installs the NVIDIA device plugin Helm chart when enable_nvidia_device_plugin
+# is true. Each argument reads its override from nvidia_device_plugin_helm_config
+# and falls back to the default given as the last try() argument.
+resource "helm_release" "nvidia_device_plugin" {
+  count = var.enable_nvidia_device_plugin ? 1 : 0
+
+  name                       = try(var.nvidia_device_plugin_helm_config["name"], "nvidia-device-plugin")
+  repository                 = try(var.nvidia_device_plugin_helm_config["repository"], "https://nvidia.github.io/k8s-device-plugin")
+  chart                      = try(var.nvidia_device_plugin_helm_config["chart"], "nvidia-device-plugin")
+  version                    = try(var.nvidia_device_plugin_helm_config["version"], "0.14.4")
+  timeout                    = try(var.nvidia_device_plugin_helm_config["timeout"], 300)
+  values                     = [local.nvidia_device_plugin_merged_values_yaml]
+  create_namespace           = try(var.nvidia_device_plugin_helm_config["create_namespace"], true)
+  namespace                  = try(var.nvidia_device_plugin_helm_config["namespace"], "nvidia-device-plugin")
+  lint                       = try(var.nvidia_device_plugin_helm_config["lint"], false)
+  description                = try(var.nvidia_device_plugin_helm_config["description"], "")
+  repository_key_file        = try(var.nvidia_device_plugin_helm_config["repository_key_file"], "")
+  repository_cert_file       = try(var.nvidia_device_plugin_helm_config["repository_cert_file"], "")
+  repository_username        = try(var.nvidia_device_plugin_helm_config["repository_username"], "")
+  repository_password        = try(var.nvidia_device_plugin_helm_config["repository_password"], "")
+  verify                     = try(var.nvidia_device_plugin_helm_config["verify"], false)
+  keyring                    = try(var.nvidia_device_plugin_helm_config["keyring"], "")
+  disable_webhooks           = try(var.nvidia_device_plugin_helm_config["disable_webhooks"], false)
+  reuse_values               = try(var.nvidia_device_plugin_helm_config["reuse_values"], false)
+  reset_values               = try(var.nvidia_device_plugin_helm_config["reset_values"], false)
+  force_update               = try(var.nvidia_device_plugin_helm_config["force_update"], false)
+  recreate_pods              = try(var.nvidia_device_plugin_helm_config["recreate_pods"], false)
+  cleanup_on_fail            = try(var.nvidia_device_plugin_helm_config["cleanup_on_fail"], false)
+  max_history                = try(var.nvidia_device_plugin_helm_config["max_history"], 0)
+  atomic                     = try(var.nvidia_device_plugin_helm_config["atomic"], false)
+  skip_crds                  = try(var.nvidia_device_plugin_helm_config["skip_crds"], false)
+  render_subchart_notes      = try(var.nvidia_device_plugin_helm_config["render_subchart_notes"], true)
+  disable_openapi_validation = try(var.nvidia_device_plugin_helm_config["disable_openapi_validation"], false)
+  wait                       = try(var.nvidia_device_plugin_helm_config["wait"], true)
+  wait_for_jobs              = try(var.nvidia_device_plugin_helm_config["wait_for_jobs"], false)
+  dependency_update          = try(var.nvidia_device_plugin_helm_config["dependency_update"], false)
+  replace                    = try(var.nvidia_device_plugin_helm_config["replace"], false)
+
+  postrender {
+    binary_path = try(var.nvidia_device_plugin_helm_config["postrender"], "")
+  }
+
+  dynamic "set" {
+    iterator = each_item
+    for_each = try(var.nvidia_device_plugin_helm_config["set"], [])
+
+    content {
+      name  = each_item.value.name
+      value = each_item.value.value
+      type  = try(each_item.value.type, null)
+    }
+  }
+
+  dynamic "set_sensitive" {
+    iterator = each_item
+    for_each = try(var.nvidia_device_plugin_helm_config["set_sensitive"], [])
+
+    content {
+      name  = each_item.value.name
+      value = each_item.value.value
+      type  = try(each_item.value.type, null)
+    }
+  }
+
+}
diff --git a/variables.tf b/variables.tf
index 2ed6291..5decad9 100644
--- a/variables.tf
+++ b/variables.tf
@@ -319,3 +319,18 @@ variable "karpenter_resources_helm_config" {
   type        = any
   default     = {}
 }
+
+#---------------------------------------------------
+# NVIDIA Device Plugin
+#---------------------------------------------------
+variable "enable_nvidia_device_plugin" {
+  description = "Enable NVIDIA Device Plugin add-on"
+  type        = bool
+  default     = false
+}
+
+variable "nvidia_device_plugin_helm_config" {
+  description = "NVIDIA Device Plugin Helm Chart config"
+  type        = any
+  default     = {}
+}