redhat-na-ssa · dmarcus-wire · Jul 17, 2024 · Jul 3, 2024 · Jul 3, 2024 · Jul 3, 2024
diff --git a/.wordlist-md b/.wordlist-md
diff --git a/README.md b/README.md
@@ -3,19 +3,19 @@
 This is the Hobbyist Guide to Installing and Configuring RHOAI for customers. Bring your towel. This repo is intentionally imperative: it aggregates the various official docs into a single Markdown guide and paves the way for declarative automation in the [ai-gitops-catalog](https://github.com/redhat-na-ssa/demo-ai-gitops-catalog).
 
 - OCP Instance: AWS with OpenShift Open Environment
-- OCP Version: 4.14.27
-- RHOAI Version: 2.9.1
+- OCP Version: 4.15
+- RHOAI Version: 2.10
 
 ```shell
 .
 ├── LICENSE
 ├── README.md
 ├── notes
-│   ├── 00_FEATURES.md                  # Overview of the features in RHOAI 2.9
-│   ├── 01_DASHBOARD.md                 # Deep dive into the RHOAI 2.9 Dashboard
+│   ├── 00_FEATURES.md                  # Overview of the features in RHOAI
+│   ├── 01_DASHBOARD.md                 # Deep dive into the RHOAI dashboard
 │   ├── 02_CHECKLIST.md                 # Technical overview for RHOAI install/config
-│   ├── 03_CHECKLIST_PROCEDURE.md       # Detailed steps that accompany 02_CHECKLIST.md
-│   ├── 04_TUTORIAL_FRAUD.md            # Notes for the Fraud Detection demo
-│   ├── 05_TUTORIAL_DISTR_WORKLOADS.md  # Notes for the Distributed Workloads demo
+│   ├── 03_CHECKLIST_PROCEDURE.md       # Additional detailed steps
+│   ├── 04_TUTORIAL_FRAUD.md            # Notes for the fraud detection demo
+│   ├── 05_TUTORIAL_DISTR_WORKLOADS.md  # Notes for the distributed workloads demo
 │   └── configs                         # These are the config files used in the 03_CHECKLIST_PROCEDURE.md
 ```
diff --git a/configs/authorino-instance-ns.yaml b/configs/authorino-instance-ns.yaml
@@ -0,0 +1,4 @@
+---
+# Namespace "redhat-ods-applications-auth-provider".
+# NOTE(review): presumably hosts the Authorino instance (going by this
+# file's name, authorino-instance-ns.yaml) - confirm against the procedure.
+kind: Namespace
+apiVersion: v1
+metadata:
+  name: redhat-ods-applications-auth-provider
diff --git a/configs/authorino-subscription.yaml b/configs/authorino-subscription.yaml
@@ -9,4 +9,4 @@ spec:
   name: authorino-operator
   source: redhat-operators
   sourceNamespace: openshift-marketplace
-  startingCSV: authorino-operator.v1.0.1
+  # startingCSV: authorino-operator.v1.0.1
diff --git a/configs/nvidia-dcgm-dashboard-cm.json → configs/files/nvidia-dcgm-dashboard.json b/configs/nvidia-dcgm-dashboard-cm.json → configs/files/nvidia-dcgm-dashboard.json
diff --git a/configs/files/ocp-machineset.yaml b/configs/files/ocp-machineset.yaml
@@ -0,0 +1,73 @@
+---
+# Sample AWS MachineSet using a g4dn.xlarge instance type.
+#
+# This manifest was captured from a live cluster. Server-populated metadata
+# (creationTimestamp, generation, resourceVersion, uid) has been removed so
+# the manifest can be created/applied without being rejected or conflicting
+# with an existing object.
+#
+# The infrastructure ID (rhoai29-cd8g7), AMI ID, region / availability zone,
+# IAM instance profile, security group and subnet tag values below are the
+# values of the cluster this was captured from.
+apiVersion: machine.openshift.io/v1beta1
+kind: MachineSet
+metadata:
+  annotations:
+    capacity.cluster-autoscaler.kubernetes.io/labels: kubernetes.io/arch=amd64
+    # NOTE(review): GPU is "0" although instanceType below is g4dn.xlarge -
+    # confirm whether this annotation should reflect the instance's GPU count.
+    machine.openshift.io/GPU: "0"
+    machine.openshift.io/memoryMb: "16384"
+    machine.openshift.io/vCPU: "4"
+  labels:
+    machine.openshift.io/cluster-api-cluster: rhoai29-cd8g7
+  name: rhoai29-cd8g7-worker-us-east-2a-gpu
+  namespace: openshift-machine-api
+spec:
+  replicas: 2
+  selector:
+    matchLabels:
+      machine.openshift.io/cluster-api-cluster: rhoai29-cd8g7
+      machine.openshift.io/cluster-api-machineset: rhoai29-cd8g7-worker-us-east-2a-gpu
+  template:
+    metadata:
+      labels:
+        machine.openshift.io/cluster-api-cluster: rhoai29-cd8g7
+        machine.openshift.io/cluster-api-machine-role: worker
+        machine.openshift.io/cluster-api-machine-type: worker
+        machine.openshift.io/cluster-api-machineset: rhoai29-cd8g7-worker-us-east-2a-gpu
+    spec:
+      lifecycleHooks: {}
+      metadata: {}
+      providerSpec:
+        value:
+          ami:
+            id: ami-049d8fda91038a0fd
+          apiVersion: machine.openshift.io/v1beta1
+          blockDevices:
+            - ebs:
+                encrypted: true
+                iops: 0
+                kmsKey:
+                  arn: ""
+                volumeSize: 120
+                volumeType: gp3
+          credentialsSecret:
+            name: aws-cloud-credentials
+          deviceIndex: 0
+          iamInstanceProfile:
+            id: rhoai29-cd8g7-worker-profile
+          instanceType: g4dn.xlarge
+          kind: AWSMachineProviderConfig
+          metadata:
+            creationTimestamp: null
+          metadataServiceOptions: {}
+          placement:
+            availabilityZone: us-east-2a
+            region: us-east-2
+          securityGroups:
+            - filters:
+                - name: tag:Name
+                  values:
+                    - rhoai29-cd8g7-worker-sg
+          subnet:
+            filters:
+              - name: tag:Name
+                values:
+                  - rhoai29-cd8g7-private-us-east-2a
+          tags:
+            - name: kubernetes.io/cluster/rhoai29-cd8g7
+              value: owned
+          userDataSecret:
+            name: worker-user-data
diff --git a/configs/servicemesh-smcp-patch.yaml → configs/files/servicemesh-smcp-patch.yaml b/configs/servicemesh-smcp-patch.yaml → configs/files/servicemesh-smcp-patch.yaml
diff --git a/configs/fix-kubeadmin.yaml b/configs/fix-kubeadmin.yaml
@@ -0,0 +1,12 @@
+---
+# Binds the cluster-admin ClusterRole to the 'kube:admin' user.
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: fix-rhoai-kubeadmin
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: cluster-admin
+subjects:
+  - apiGroup: rbac.authorization.k8s.io
+    kind: User
+    name: 'kube:admin'
diff --git a/configs/functions.sh b/configs/functions.sh
@@ -0,0 +1,153 @@
+#!/bin/bash
+
+# Set mastersSchedulable to false on the cluster-wide scheduler config.
+ocp_control_nodes_not_schedulable(){
+  oc patch schedulers.config.openshift.io/cluster \
+    --type merge \
+    --patch '{"spec":{"mastersSchedulable": false}}'
+}
+
+# Set mastersSchedulable to true on the cluster-wide scheduler config.
+ocp_control_nodes_schedulable(){
+  oc patch schedulers.config.openshift.io/cluster \
+    --type merge \
+    --patch '{"spec":{"mastersSchedulable": true}}'
+}
+
+ocp_gpu_taint_nodes(){
+  oc adm taint node -l node-role.kubernetes.io/gpu nvidia-gpu-only=:NoSchedule --overwrite
+  oc adm drain -l node-role.kubernetes.io/gpu --ignore-daemonsets --delete-emptydir-data
+  oc adm uncordon -l node-role.kubernetes.io/gpu
+}
+
+ocp_gpu_untaint_nodes(){
+  oc adm taint node -l node-role.kubernetes.io/gpu nvidia-gpu-only=:NoSchedule-
+}
+
+# Add an empty node-role.kubernetes.io/gpu label to every node that
+# carries the nvidia.com/gpu.machine label.
+ocp_gpu_label_nodes_from_nfd(){
+  oc label node \
+    -l nvidia.com/gpu.machine \
+    node-role.kubernetes.io/gpu=''
+}
+
+ocp_aws_clone_worker_machineset(){
+  [ -z "${1}" ] && \
+  echo "
+    usage: ocp_aws_clone_worker_machineset < instance type, default g4dn.4xlarge > < machine set name >
+  "
+
+  INSTANCE_TYPE=${1:-g4dn.4xlarge}
+  SHORT_NAME=${2:-${INSTANCE_TYPE%.*}}
+
+  MACHINE_SET_NAME=$(oc -n openshift-machine-api get machinesets.machine.openshift.io -o name | grep "${SHORT_NAME}" | head -n1)
+  MACHINE_SET_WORKER=$(oc -n openshift-machine-api get machinesets.machine.openshift.io -o name | grep worker | head -n1)
+
+  # check for an existing instance machine set
+  if [ -n "${MACHINE_SET_NAME}" ]; then
+    echo "Exists: machineset - ${MACHINE_SET_NAME}"
+  else
+    echo "Creating: machineset - ${SHORT_NAME}"
+    oc -n openshift-machine-api \
+      get "${MACHINE_SET_WORKER}" -o yaml | \
+        sed '/machine/ s/-worker/-'"${INSTANCE_TYPE}"'/g
+          /^  name:/ s/cluster-.*/'"${SHORT_NAME}"'/g
+          /name/ s/-worker/-'"${SHORT_NAME}"'/g
+          s/instanceType.*/instanceType: '"${INSTANCE_TYPE}"'/
+          /cluster-api-autoscaler/d
+          s/replicas.*/replicas: 0/' | \
+      oc apply -f -
+  fi
+
+  # cosmetic pretty
+  oc -n openshift-machine-api \
+    patch "${MACHINE_SET_NAME}" \
+    --type=merge --patch '{"spec":{"template":{"spec":{"metadata":{"labels":{"node-role.kubernetes.io/'"${SHORT_NAME}"'":""}}}}}}'
+}
+
+ocp_aws_cluster_autoscaling(){
+  oc apply -k https://github.com/redhat-na-ssa/demo-ai-gitops-catalog/components/configs/cluster/autoscale/overlays/gpus-accelerator-label?ref=v0.04
+
+  ocp_aws_create_gpu_machineset g4dn.4xlarge
+  ocp_create_machineset_autoscale 0 3
+
+  # scale workers to 1
+  WORKER_MS="$(oc -n openshift-machine-api get machineset -o name | grep worker)"
+  ocp_scale_machineset 1 "${WORKER_MS}"
+
+  ocp_control_nodes_not_schedulable
+}
+
+ocp_aws_create_gpu_machineset(){
+  # https://aws.amazon.com/ec2/instance-types/g4
+  # single gpu: g4dn.{2,4,8,16}xlarge
+  # multi gpu:  g4dn.12xlarge
+  # practical:  g4ad.4xlarge
+  # a100 (MIG): p4d.24xlarge
+  # h100 (MIG): p5.48xlarge
+
+  # https://aws.amazon.com/ec2/instance-types/dl1
+  # 8 x gaudi:  dl1.24xlarge
+
+  INSTANCE_TYPE=${1:-g4dn.4xlarge}
+
+  ocp_aws_clone_worker_machineset "${INSTANCE_TYPE}"
+
+  MACHINE_SET_TYPE=$(oc -n openshift-machine-api get machinesets.machine.openshift.io -o name | grep "${INSTANCE_TYPE%.*}" | head -n1)
+
+  echo "Patching: ${MACHINE_SET_TYPE}"
+
+  # cosmetic
+  oc -n openshift-machine-api \
+    patch "${MACHINE_SET_TYPE}" \
+    --type=merge --patch '{"spec":{"template":{"spec":{"metadata":{"labels":{"node-role.kubernetes.io/gpu":""}}}}}}'
+
+  # taint nodes for gpu-only workloads
+  oc -n openshift-machine-api \
+    patch "${MACHINE_SET_TYPE}" \
+    --type=merge --patch '{"spec":{"template":{"spec":{"taints":[{"key":"nvidia-gpu-only","value":"","effect":"NoSchedule"}]}}}}'
+
+  # should use the default profile
+  # oc -n openshift-machine-api \
+  #   patch "${MACHINE_SET_TYPE}" \
+  #   --type=merge --patch '{"spec":{"template":{"spec":{"metadata":{"labels":{"nvidia.com/device-plugin.config":"no-time-sliced"}}}}}}'
+
+  # should help auto provisioner
+  oc -n openshift-machine-api \
+    patch "${MACHINE_SET_TYPE}" \
+    --type=merge --patch '{"spec":{"template":{"spec":{"metadata":{"labels":{"cluster-api/accelerator":"nvidia-gpu"}}}}}}'
+
+  oc -n openshift-machine-api \
+    patch "${MACHINE_SET_TYPE}" \
+    --type=merge --patch '{"metadata":{"labels":{"cluster-api/accelerator":"nvidia-gpu"}}}'
+
+  oc -n openshift-machine-api \
+    patch "${MACHINE_SET_TYPE}" \
+    --type=merge --patch '{"spec":{"template":{"spec":{"providerSpec":{"value":{"instanceType":"'"${INSTANCE_TYPE}"'"}}}}}}'
+}
+
+ocp_create_machineset_autoscale(){
+  MACHINE_MIN=${1:-0}
+  MACHINE_MAX=${2:-4}
+  MACHINE_SETS=${3:-$(oc -n openshift-machine-api get machinesets.machine.openshift.io -o name | sed 's@.*/@@' )}
+
+  for set in ${MACHINE_SETS}
+  do
+cat << YAML | oc apply -f -
+apiVersion: "autoscaling.openshift.io/v1beta1"
+kind: "MachineAutoscaler"
+metadata:
+  name: "${set}"
+  namespace: "openshift-machine-api"
+spec:
+  minReplicas: ${MACHINE_MIN}
+  maxReplicas: ${MACHINE_MAX}
+  scaleTargetRef:
+    apiVersion: machine.openshift.io/v1beta1
+    kind: MachineSet
+    name: "${set}"
+YAML
+  done
+}
+
+ocp_scale_machineset(){
+  REPLICAS=${1:-1}
+  MACHINE_SETS=${2:-$(oc -n openshift-machine-api get machineset -o name)}
+
+  # scale workers
+  echo "${MACHINE_SETS}" | \
+    xargs \
+      oc -n openshift-machine-api \
+      scale --replicas="${REPLICAS}"
+}
diff --git a/configs/htpass-secret.yaml b/configs/htpass-secret.yaml
diff --git a/configs/htpass-admin-rolebinding.yaml → configs/htpasswd-admin-rolebinding.yaml b/configs/htpass-admin-rolebinding.yaml → configs/htpasswd-admin-rolebinding.yaml
diff --git a/configs/htpass-cr.yaml → configs/htpasswd-cr.yaml b/configs/htpass-cr.yaml → configs/htpasswd-cr.yaml
@@ -5,11 +5,11 @@ metadata:
 spec:
   identityProviders:
     # This provider name is prefixed to provider user names to form an identity name.
-  - name: my_htpasswd_provider
+  - name: htpasswd
     # Controls how mappings are established between this provider’s identities and User objects.
     mappingMethod: claim
     type: HTPasswd
     htpasswd:
       fileData:
         # An existing secret containing a file generated using htpasswd.
-        name: htpass-secret
+        name: htpasswd-secret
diff --git a/configs/htpasswd-secret.yaml b/configs/htpasswd-secret.yaml
@@ -0,0 +1,9 @@
+---
+# Secret "htpasswd-secret" in openshift-config.
+# The htpasswd key below holds a placeholder; replace it with the
+# contents of a file generated using htpasswd.
+kind: Secret
+apiVersion: v1
+type: Opaque
+metadata:
+  name: htpasswd-secret
+  namespace: openshift-config
+stringData:
+  htpasswd: |
+    # <htpasswd_file_contents>
diff --git a/configs/nfd-instance.yaml b/configs/nfd-instance.yaml
@@ -4,8 +4,6 @@ metadata:
   name: nfd-instance
   namespace: openshift-nfd
 spec:
-  customConfig:
-    configData:
   operand:
     image: 'registry.redhat.io/openshift4/ose-node-feature-discovery@sha256:96984b49c21fa4b76e8ca26735521a0a32daa4c5e330397641fe47ae4d774df4'
     servicePort: 12000