update: operator install

redhat-na-ssa · Apr 2, 2024 · 4e25da0 · 4e25da0
1 parent dc0a9ae
commit 4e25da0
Show file tree

Hide file tree

Showing 4 changed files with 44 additions and 28 deletions.
diff --git a/components/operators/gpu-operator-certified/instance/base/aws/kustomization.yaml b/components/operators/gpu-operator-certified/instance/base/aws/kustomization.yaml
@@ -4,4 +4,4 @@ kind: Kustomization
 namespace: nvidia-gpu-operator
 
 resources:
-  - setup-job.yaml
+  - setup-machineset.yaml
diff --git a/components/operators/gpu-operator-certified/instance/base/aws/setup-job.yaml b/components/operators/gpu-operator-certified/instance/base/aws/setup-machineset.yaml
@@ -77,59 +77,75 @@ spec:
                 oc -n kube-system get secret/aws-creds -o name > /dev/null 2>&1 || return 1
               }
 
+              ocp_aws_clone_machineset(){
+                # clone the first worker machineset, renamed for INSTANCE_TYPE, with replicas: 0
+                # called without an argument: print usage, then continue with the default type
+                [ -z "${1}" ] && \
+                echo "
+                  usage: ocp_aws_clone_machineset < instance type, default g4dn.4xlarge >
+                "
+                INSTANCE_TYPE=${1:-g4dn.4xlarge}
+                MACHINE_SET=$(oc -n openshift-machine-api get machinesets.machine.openshift.io -o name | grep worker | head -n1)
+                # check for an existing instance machine set
+                if oc -n openshift-machine-api get machinesets.machine.openshift.io -o name | grep -q "${INSTANCE_TYPE%.*}"; then
+                  echo "Exists: machineset - ${INSTANCE_TYPE}"
+                else
+                  echo "Creating: machineset - ${INSTANCE_TYPE}"
+                  oc -n openshift-machine-api \
+                    get "${MACHINE_SET}" -o yaml | \
+                      sed '/machine/ s/-worker/-'"${INSTANCE_TYPE}"'/g
+                        /name/ s/-worker/-'"${INSTANCE_TYPE%.*}"'/g
+                        s/instanceType.*/instanceType: '"${INSTANCE_TYPE}"'/
+                        s/replicas.*/replicas: 0/' | \
+                    oc apply -f -
+                fi
+              }
+
               ocp_aws_create_gpu_machineset(){
                 # https://aws.amazon.com/ec2/instance-types/g4
                 # single gpu: g4dn.{2,4,8,16}xlarge
-                # multi gpu: g4dn.12xlarge
-                # cheapest: g4ad.4xlarge
+                # multi gpu:  g4dn.12xlarge
+                # practical:  g4ad.4xlarge
                 # a100 (MIG): p4d.24xlarge
                 # h100 (MIG): p5.48xlarge
+
+                # https://aws.amazon.com/ec2/instance-types/dl1
+                # 8 x gaudi:  dl1.24xlarge
+
                 INSTANCE_TYPE=${1:-g4dn.4xlarge}
-                MACHINE_SET=$(oc -n openshift-machine-api get machinesets.machine.openshift.io -o name | grep worker | head -n1)
 
-                # check for an existing gpu machine set
-                if oc -n openshift-machine-api get machinesets.machine.openshift.io -o name | grep gpu; then
-                  echo "Exists: GPU machineset"
-                else
-                  echo "Creating: GPU machineset"
-                  oc -n openshift-machine-api get "${MACHINE_SET}" -o yaml | \
-                    sed '/machine/ s/-worker/-gpu/g
-                      /name/ s/-worker/-gpu/g
-                      s/instanceType.*/instanceType: '"${INSTANCE_TYPE}"'/
-                      s/replicas.*/replicas: 0/' | \
-                    oc apply -f -
-                fi
+                ocp_aws_clone_machineset "${INSTANCE_TYPE}"
 
-                MACHINE_SET_GPU=$(oc -n openshift-machine-api get machinesets.machine.openshift.io -o name | grep gpu | head -n1)
+                MACHINE_SET_TYPE=$(oc -n openshift-machine-api get machinesets.machine.openshift.io -o name | grep "${INSTANCE_TYPE%.*}" | head -n1)
 
-                echo "Patching: GPU machineset"
+                echo "Patching: ${MACHINE_SET_TYPE}"
 
                 # cosmetic
                 oc -n openshift-machine-api \
-                  patch "${MACHINE_SET_GPU}" \
+                  patch "${MACHINE_SET_TYPE}" \
                   --type=merge --patch '{"spec":{"template":{"spec":{"metadata":{"labels":{"node-role.kubernetes.io/gpu":""}}}}}}'
 
                 # taint nodes for gpu-only workloads
-                # oc -n openshift-machine-api \
-                #   patch "${MACHINE_SET_GPU}" \
-                #   --type=merge --patch '{"spec":{"template":{"spec":{"taints":[{"key":"gpu-required","value":"yes","effect":"NoSchedule"}]}}}}'
+                oc -n openshift-machine-api \
+                  patch "${MACHINE_SET_TYPE}" \
+                  --type=merge --patch '{"spec":{"template":{"spec":{"taints":[{"key":"nvidia-gpu-only","value":"","effect":"NoSchedule"}]}}}}'
 
                 # should use the default profile
                 # oc -n openshift-machine-api \
-                #   patch "${MACHINE_SET_GPU}" \
+                #   patch "${MACHINE_SET_TYPE}" \
                 #   --type=merge --patch '{"spec":{"template":{"spec":{"metadata":{"labels":{"nvidia.com/device-plugin.config":"no-time-sliced"}}}}}}'
 
                 # should help auto provisioner
                 oc -n openshift-machine-api \
-                  patch "${MACHINE_SET_GPU}" \
+                  patch "${MACHINE_SET_TYPE}" \
                   --type=merge --patch '{"spec":{"template":{"spec":{"metadata":{"labels":{"cluster-api/accelerator":"nvidia-gpu"}}}}}}'
 
                   oc -n openshift-machine-api \
-                  patch "${MACHINE_SET_GPU}" \
+                  patch "${MACHINE_SET_TYPE}" \
                   --type=merge --patch '{"metadata":{"labels":{"cluster-api/accelerator":"nvidia-gpu"}}}'
 
                 oc -n openshift-machine-api \
-                  patch "${MACHINE_SET_GPU}" \
+                  patch "${MACHINE_SET_TYPE}" \
                   --type=merge --patch '{"spec":{"template":{"spec":{"providerSpec":{"value":{"instanceType":"'"${INSTANCE_TYPE}"'"}}}}}}'
               }
 

diff --git a/components/operators/gpu-operator-certified/instance/base/cluster-policy.yaml b/components/operators/gpu-operator-certified/instance/base/cluster-policy.yaml
@@ -53,7 +53,7 @@ spec:
       maxUnavailable: '1'
     tolerations:
       - effect: NoSchedule
-        key: gpu-required
+        key: nvidia-gpu-only
         operator: Exists
   devicePlugin:
     enabled: true

diff --git a/components/operators/gpu-operator-certified/operator/base/operator-group.yaml b/components/operators/gpu-operator-certified/operator/base/operator-group.yaml
@@ -1,7 +1,7 @@
 apiVersion: operators.coreos.com/v1
 kind: OperatorGroup
 metadata:
-  name: gpu-operator-certified-group
+  name: gpu-operator-certified
   namespace: nvidia-gpu-operator
 spec:
   targetNamespaces: