Skip to content

Commit

Permalink
Nvidia device plugin addon added
Browse files Browse the repository at this point in the history
  • Loading branch information
vara-bonthu committed Feb 19, 2024
1 parent ef74485 commit c2e8b95
Show file tree
Hide file tree
Showing 3 changed files with 100 additions and 0 deletions.
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,7 @@ module "eks_data_addons" {
| [helm_release.kubecost](https://registry.terraform.io/providers/hashicorp/helm/latest/docs/resources/release) | resource |
| [helm_release.kuberay_operator](https://registry.terraform.io/providers/hashicorp/helm/latest/docs/resources/release) | resource |
| [helm_release.mlflow_tracking](https://registry.terraform.io/providers/hashicorp/helm/latest/docs/resources/release) | resource |
| [helm_release.nvidia_device_plugin](https://registry.terraform.io/providers/hashicorp/helm/latest/docs/resources/release) | resource |
| [helm_release.nvidia_gpu_operator](https://registry.terraform.io/providers/hashicorp/helm/latest/docs/resources/release) | resource |
| [helm_release.pinot](https://registry.terraform.io/providers/hashicorp/helm/latest/docs/resources/release) | resource |
| [helm_release.spark_history_server](https://registry.terraform.io/providers/hashicorp/helm/latest/docs/resources/release) | resource |
Expand Down Expand Up @@ -137,6 +138,7 @@ module "eks_data_addons" {
| <a name="input_enable_kubecost"></a> [enable\_kubecost](#input\_enable\_kubecost) | Enable Kubecost add-on | `bool` | `false` | no |
| <a name="input_enable_kuberay_operator"></a> [enable\_kuberay\_operator](#input\_enable\_kuberay\_operator) | Enable Kuberay Operator add-on | `bool` | `false` | no |
| <a name="input_enable_mlflow_tracking"></a> [enable\_mlflow\_tracking](#input\_enable\_mlflow\_tracking) | Enable MLflow Tracking add-on | `bool` | `false` | no |
| <a name="input_enable_nvidia_device_plugin"></a> [enable\_nvidia\_device\_plugin](#input\_enable\_nvidia\_device\_plugin) | Enable NVIDIA Device Plugin add-on | `bool` | `false` | no |
| <a name="input_enable_nvidia_gpu_operator"></a> [enable\_nvidia\_gpu\_operator](#input\_enable\_nvidia\_gpu\_operator) | Enable NVIDIA GPU Operator add-on | `bool` | `false` | no |
| <a name="input_enable_pinot"></a> [enable\_pinot](#input\_enable\_pinot) | Enable Apache Pinot Add-On | `bool` | `false` | no |
| <a name="input_enable_spark_history_server"></a> [enable\_spark\_history\_server](#input\_enable\_spark\_history\_server) | Enable Spark History Server add-on | `bool` | `false` | no |
Expand All @@ -150,6 +152,7 @@ module "eks_data_addons" {
| <a name="input_kubecost_helm_config"></a> [kubecost\_helm\_config](#input\_kubecost\_helm\_config) | Kubecost Helm Chart config | `any` | `{}` | no |
| <a name="input_kuberay_operator_helm_config"></a> [kuberay\_operator\_helm\_config](#input\_kuberay\_operator\_helm\_config) | Helm configuration for Kuberay Operator | `any` | `{}` | no |
| <a name="input_mlflow_tracking_helm_config"></a> [mlflow\_tracking\_helm\_config](#input\_mlflow\_tracking\_helm\_config) | MLflow Tracking add-on Helm Chart config | `any` | `{}` | no |
| <a name="input_nvidia_device_plugin_helm_config"></a> [nvidia\_device\_plugin\_helm\_config](#input\_nvidia\_device\_plugin\_helm\_config) | NVIDIA Device Plugin Helm Chart config | `any` | `{}` | no |
| <a name="input_nvidia_gpu_operator_helm_config"></a> [nvidia\_gpu\_operator\_helm\_config](#input\_nvidia\_gpu\_operator\_helm\_config) | Helm configuration for NVIDIA GPU Operator | `any` | `{}` | no |
| <a name="input_oidc_provider_arn"></a> [oidc\_provider\_arn](#input\_oidc\_provider\_arn) | The ARN of the cluster OIDC Provider | `string` | n/a | yes |
| <a name="input_pinot_helm_config"></a> [pinot\_helm\_config](#input\_pinot\_helm\_config) | Apache Pinot Helm Chart config | `any` | `{}` | no |
Expand Down
82 changes: 82 additions & 0 deletions nvidia-device-plugin.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
locals {
  # Default Helm values for the NVIDIA device plugin chart: switches on the
  # `gfd` and `nfd` sub-components and gives the NFD worker two tolerations
  # (the `nvidia.com/gpu` NoSchedule taint, plus a blanket `Exists` toleration).
  nvidia_device_plugin_default_values = <<-EOT
    gfd:
      enabled: true
    nfd:
      enabled: true
      worker:
        tolerations:
          - key: nvidia.com/gpu
            operator: Exists
            effect: NoSchedule
          - operator: "Exists"
  EOT

  # Final values document handed to the helm_release.
  #
  # Every entry of `nvidia_device_plugin_helm_config["values"]` is decoded and
  # merged over the defaults in order (later entries win), instead of only the
  # first entry. With zero or one entry the result is unchanged.
  #
  # An entry that is not valid YAML is treated as `{}` (best-effort, as before),
  # and a missing or non-list `values` key yields just the defaults.
  #
  # NOTE: `merge()` is shallow. A user document that sets a top-level key such
  # as `nfd` replaces the whole default `nfd` map rather than being combined
  # with it key-by-key.
  nvidia_device_plugin_merged_values_yaml = yamlencode(merge(
    yamldecode(local.nvidia_device_plugin_default_values),
    try(
      [for values_doc in var.nvidia_device_plugin_helm_config["values"] : try(yamldecode(values_doc), {})],
      []
    )...
  ))
}

# Installs the NVIDIA device plugin Helm chart when
# `var.enable_nvidia_device_plugin` is true.
#
# Every argument can be overridden through the matching key of
# `var.nvidia_device_plugin_helm_config`; the second argument of each `try()`
# is the default used when that key is absent.
resource "helm_release" "nvidia_device_plugin" {
  count = var.enable_nvidia_device_plugin ? 1 : 0

  # The default release name used to be "neuron-device-plugin", a copy-paste
  # leftover that mislabelled this NVIDIA release. Existing installations that
  # must keep the old release name can set
  # `nvidia_device_plugin_helm_config = { name = "neuron-device-plugin" }`.
  name       = try(var.nvidia_device_plugin_helm_config["name"], "nvidia-device-plugin")
  repository = try(var.nvidia_device_plugin_helm_config["repository"], "https://nvidia.github.io/k8s-device-plugin")
  chart      = try(var.nvidia_device_plugin_helm_config["chart"], "nvidia-device-plugin")
  version    = try(var.nvidia_device_plugin_helm_config["version"], "0.14.4")
  timeout    = try(var.nvidia_device_plugin_helm_config["timeout"], 300)

  # Module defaults merged with the user-supplied values (see locals).
  values = [local.nvidia_device_plugin_merged_values_yaml]

  create_namespace           = try(var.nvidia_device_plugin_helm_config["create_namespace"], true)
  namespace                  = try(var.nvidia_device_plugin_helm_config["namespace"], "nvidia-device-plugin")
  lint                       = try(var.nvidia_device_plugin_helm_config["lint"], false)
  description                = try(var.nvidia_device_plugin_helm_config["description"], "")
  repository_key_file        = try(var.nvidia_device_plugin_helm_config["repository_key_file"], "")
  repository_cert_file       = try(var.nvidia_device_plugin_helm_config["repository_cert_file"], "")
  repository_username        = try(var.nvidia_device_plugin_helm_config["repository_username"], "")
  repository_password        = try(var.nvidia_device_plugin_helm_config["repository_password"], "")
  verify                     = try(var.nvidia_device_plugin_helm_config["verify"], false)
  keyring                    = try(var.nvidia_device_plugin_helm_config["keyring"], "")
  disable_webhooks           = try(var.nvidia_device_plugin_helm_config["disable_webhooks"], false)
  reuse_values               = try(var.nvidia_device_plugin_helm_config["reuse_values"], false)
  reset_values               = try(var.nvidia_device_plugin_helm_config["reset_values"], false)
  force_update               = try(var.nvidia_device_plugin_helm_config["force_update"], false)
  recreate_pods              = try(var.nvidia_device_plugin_helm_config["recreate_pods"], false)
  cleanup_on_fail            = try(var.nvidia_device_plugin_helm_config["cleanup_on_fail"], false)
  max_history                = try(var.nvidia_device_plugin_helm_config["max_history"], 0)
  atomic                     = try(var.nvidia_device_plugin_helm_config["atomic"], false)
  skip_crds                  = try(var.nvidia_device_plugin_helm_config["skip_crds"], false)
  render_subchart_notes      = try(var.nvidia_device_plugin_helm_config["render_subchart_notes"], true)
  disable_openapi_validation = try(var.nvidia_device_plugin_helm_config["disable_openapi_validation"], false)
  wait                       = try(var.nvidia_device_plugin_helm_config["wait"], true)
  wait_for_jobs              = try(var.nvidia_device_plugin_helm_config["wait_for_jobs"], false)
  dependency_update          = try(var.nvidia_device_plugin_helm_config["dependency_update"], false)
  replace                    = try(var.nvidia_device_plugin_helm_config["replace"], false)

  # The `postrender` key of the helm config is used directly as the
  # post-renderer binary path; it defaults to an empty string.
  postrender {
    binary_path = try(var.nvidia_device_plugin_helm_config["postrender"], "")
  }

  # One `set` block per entry of helm_config["set"]; each entry needs `name`
  # and `value`, `type` is optional.
  dynamic "set" {
    iterator = each_item
    for_each = try(var.nvidia_device_plugin_helm_config["set"], [])

    content {
      name  = each_item.value.name
      value = each_item.value.value
      type  = try(each_item.value.type, null)
    }
  }

  # Same shape as `set`, read from helm_config["set_sensitive"].
  dynamic "set_sensitive" {
    iterator = each_item
    for_each = try(var.nvidia_device_plugin_helm_config["set_sensitive"], [])

    content {
      name  = each_item.value.name
      value = each_item.value.value
      type  = try(each_item.value.type, null)
    }
  }
}
15 changes: 15 additions & 0 deletions variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -319,3 +319,18 @@ variable "karpenter_resources_helm_config" {
type = any
default = {}
}

#---------------------------------------------------
# NVIDIA Device Plugin
#---------------------------------------------------
variable "enable_nvidia_device_plugin" {
  # Feature flag: the NVIDIA device plugin helm_release is created only when
  # this is true (it drives the resource's `count`). Off by default.
  type        = bool
  default     = false
  description = "Enable NVIDIA Device Plugin add-on"
}

variable "nvidia_device_plugin_helm_config" {
  # Free-form override map for the NVIDIA device plugin helm_release.
  # Keys read by the module include `name`, `repository`, `chart`, `version`,
  # `namespace`, `timeout`, `values`, `set`, `set_sensitive` and `postrender`,
  # plus the other helm_release arguments; any key left out falls back to the
  # module's default for that argument.
  type        = any
  default     = {}
  description = "NVIDIA Device Plugin Helm Chart config"
}

0 comments on commit c2e8b95

Please sign in to comment.