Skip to content

Commit

Permalink
update: operator install
Browse files Browse the repository at this point in the history
  • Loading branch information
codekow committed Apr 2, 2024
1 parent dc0a9ae commit 4e25da0
Show file tree
Hide file tree
Showing 4 changed files with 44 additions and 28 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,4 @@ kind: Kustomization
namespace: nvidia-gpu-operator

resources:
- setup-job.yaml
- setup-machineset.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -77,59 +77,75 @@ spec:
oc -n kube-system get secret/aws-creds -o name > /dev/null 2>&1 || return 1
}
# Clone the first worker MachineSet into a new zero-replica MachineSet that
# uses the requested AWS instance type. Does nothing when a MachineSet whose
# name contains the instance family (the text before the last '.', e.g. g4dn)
# already exists.
#
# Arguments: $1 - AWS instance type (optional, default g4dn.4xlarge)
# Globals:   INSTANCE_TYPE, MACHINE_SET (written; deliberately left global so
#            the variables stay visible to the surrounding script as before)
# Outputs:   progress messages on stdout, errors on stderr
# Returns:   1 when there is no worker MachineSet to clone; otherwise the
#            status of the last command that ran
ocp_aws_clone_machineset(){
  # a missing argument is not an error: show the hint, then use the default
  if [ -z "${1:-}" ]; then
    printf '\nusage: ocp_aws_clone_machineset < instance type, default g4dn.4xlarge >\n\n'
  fi

  INSTANCE_TYPE=${1:-g4dn.4xlarge}
  MACHINE_SET=$(oc -n openshift-machine-api get machinesets.machine.openshift.io -o name | grep worker | head -n1)

  # check for an existing instance machine set
  if oc -n openshift-machine-api get machinesets.machine.openshift.io -o name | grep -q "${INSTANCE_TYPE%.*}"; then
    echo "Exists: machineset - ${INSTANCE_TYPE}"
  else
    # without a template there is nothing to clone; stop before calling
    # 'oc get ""' and piping its (empty or unrelated) output into 'oc apply'
    if [ -z "${MACHINE_SET}" ]; then
      echo "Error: no worker machineset found to clone" >&2
      return 1
    fi

    echo "Creating: machineset - ${INSTANCE_TYPE}"
    # sed rewrites the worker manifest:
    #   lines matching 'machine': -worker -> -<instance type>
    #   lines matching 'name':    any remaining -worker -> -<instance family>
    #   instanceType is replaced and replicas forced to 0
    oc -n openshift-machine-api \
      get "${MACHINE_SET}" -o yaml | \
        sed '/machine/ s/-worker/-'"${INSTANCE_TYPE}"'/g
          /name/ s/-worker/-'"${INSTANCE_TYPE%.*}"'/g
          s/instanceType.*/instanceType: '"${INSTANCE_TYPE}"'/
          s/replicas.*/replicas: 0/' | \
        oc apply -f -
  fi
}
# Ensure a MachineSet exists for the given AWS GPU instance type, then patch it
# with GPU labels, a GPU-only taint and the instance type.
#
# Arguments: $1 - AWS instance type (optional, default g4dn.4xlarge)
# Globals:   INSTANCE_TYPE, MACHINE_SET_TYPE (written)
# Outputs:   progress messages on stdout, errors on stderr
# Returns:   1 when no MachineSet matching the instance family is found;
#            otherwise the status of the last 'oc patch'
ocp_aws_create_gpu_machineset(){
  # https://aws.amazon.com/ec2/instance-types/g4
  # single gpu: g4dn.{2,4,8,16}xlarge
  # multi gpu:  g4dn.12xlarge
  # practical:  g4ad.4xlarge
  # a100 (MIG): p4d.24xlarge
  # h100 (MIG): p5.48xlarge

  # https://aws.amazon.com/ec2/instance-types/dl1
  # 8 x gaudi:  dl1.24xlarge

  INSTANCE_TYPE=${1:-g4dn.4xlarge}

  # creation is delegated to the generic clone helper
  ocp_aws_clone_machineset "${INSTANCE_TYPE}"

  # look the machineset up by instance family (text before the last '.')
  MACHINE_SET_TYPE=$(oc -n openshift-machine-api get machinesets.machine.openshift.io -o name | grep "${INSTANCE_TYPE%.*}" | head -n1)

  # every patch below needs exactly one target; stop if there is none
  if [ -z "${MACHINE_SET_TYPE}" ]; then
    echo "Error: no machineset found for instance type ${INSTANCE_TYPE}" >&2
    return 1
  fi

  echo "Patching: ${MACHINE_SET_TYPE}"

  # cosmetic
  oc -n openshift-machine-api \
    patch "${MACHINE_SET_TYPE}" \
    --type=merge --patch '{"spec":{"template":{"spec":{"metadata":{"labels":{"node-role.kubernetes.io/gpu":""}}}}}}'

  # taint nodes for gpu-only workloads
  oc -n openshift-machine-api \
    patch "${MACHINE_SET_TYPE}" \
    --type=merge --patch '{"spec":{"template":{"spec":{"taints":[{"key":"nvidia-gpu-only","value":"","effect":"NoSchedule"}]}}}}'

  # should use the default profile
  # oc -n openshift-machine-api \
  #   patch "${MACHINE_SET_TYPE}" \
  #   --type=merge --patch '{"spec":{"template":{"spec":{"metadata":{"labels":{"nvidia.com/device-plugin.config":"no-time-sliced"}}}}}}'

  # should help auto provisioner
  oc -n openshift-machine-api \
    patch "${MACHINE_SET_TYPE}" \
    --type=merge --patch '{"spec":{"template":{"spec":{"metadata":{"labels":{"cluster-api/accelerator":"nvidia-gpu"}}}}}}'

  oc -n openshift-machine-api \
    patch "${MACHINE_SET_TYPE}" \
    --type=merge --patch '{"metadata":{"labels":{"cluster-api/accelerator":"nvidia-gpu"}}}'

  # make sure an already existing machineset carries the requested type
  oc -n openshift-machine-api \
    patch "${MACHINE_SET_TYPE}" \
    --type=merge --patch '{"spec":{"template":{"spec":{"providerSpec":{"value":{"instanceType":"'"${INSTANCE_TYPE}"'"}}}}}}'
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ spec:
maxUnavailable: '1'
tolerations:
- effect: NoSchedule
key: gpu-required
key: nvidia-gpu-only
operator: Exists
devicePlugin:
enabled: true
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
apiVersion: operators.coreos.com/v1
kind: OperatorGroup
metadata:
name: gpu-operator-certified-group
name: gpu-operator-certified
namespace: nvidia-gpu-operator
spec:
targetNamespaces:
Expand Down

0 comments on commit 4e25da0

Please sign in to comment.