Skip to content
This repository has been archived by the owner on Aug 16, 2023. It is now read-only.

gpu_operator_set_repo-config: new role to set spec.driver.repoConfig #124

Merged
merged 4 commits into from
May 10, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions build/root/usr/local/bin/ci_entrypoint_gpu-operator.sh
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,11 @@ collect_must_gather() {
#######################################
# Validate a GPU Operator deployment: wait for it to come up, then run
# the GPU burn workload. When `oc version` reports a 4.8 server, the
# RHEL beta repository is enabled in the operator first.
# Side effects: installs `collect_must_gather` as the ERR and EXIT trap.
# Arguments: none
#######################################
validate_gpu_operator_deployment() {
  # Pattern looked up (as a grep regex) in the `oc version` output.
  local -r beta_server_pattern="Server Version: 4.8"

  trap collect_must_gather ERR EXIT

  if oc version | grep -q "${beta_server_pattern}"; then
    echo "Running on OCP 4.8, enabling RHEL beta repository"
    ./toolbox/gpu-operator/set_repo-config.sh --rhel-beta
  fi

  toolbox/gpu-operator/wait_deployment.sh
  toolbox/gpu-operator/run_gpu_burn.sh
}
Expand Down
15 changes: 13 additions & 2 deletions build/root/usr/local/bin/entitle.sh
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,17 @@ else
entitlement_deployed=1
fi

if oc version | grep -q "Server Version: 4.8"; then
# Random CA file until we have the right certificate stored as a secret
RHEL_BETA_REPO_CA=/etc/rhsm/ca/redhat-uep.pem
ENTITLEMENT_REPO_CA=${ENTITLEMENT_REPO_CA:-$RHEL_BETA_REPO_CA}
echo "INFO: Using $ENTITLEMENT_REPO_CA as RHEL-beta repo CA"

REPO_CA="--ca $ENTITLEMENT_REPO_CA"
else
REPO_CA=""
fi

ENTITLEMENT_RESOURCES=${ENTITLEMENT_RESOURCES:-/var/run/psap-entitlement-secret/01-cluster-wide-machineconfigs.yaml}
if [ "$entitlement_deployed" == 1 ]; then
# entitlement already deployed
Expand All @@ -53,7 +64,7 @@ else
ENTITLEMENT_KEY=/tmp/key.pem
extract_entitlement_key $ENTITLEMENT_RESOURCES $ENTITLEMENT_KEY

toolbox/entitlement/deploy.sh --pem "${ENTITLEMENT_KEY}"
toolbox/entitlement/deploy.sh --pem "${ENTITLEMENT_KEY}" $REPO_CA
entitlement_deployed=1
fi

Expand All @@ -65,4 +76,4 @@ fi
if ! toolbox/entitlement/wait.sh; then
echo "FATAL: Failed to properly entitle the cluster, cannot continue."
exit 1
fi
fi
7 changes: 7 additions & 0 deletions playbooks/gpu_operator_set_repo-config.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
---
# Playbook wrapper around the gpu_operator_set_repo-config role.
# It runs on localhost over a local connection (no remote hosts) and is the
# playbook invoked by toolbox/gpu-operator/set_repo-config.sh.
- name: Set a custom repository list to use in the GPU Operator ClusterPolicy
hosts: localhost
connection: local
gather_facts: true
roles:
- role: gpu_operator_set_repo-config
2 changes: 2 additions & 0 deletions roles/entitlement_deploy/defaults/main/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,5 @@
entitlement_resources: ''
# path to the PEM file of the entitlement
entitlement_pem: ''
# optional: path to a CA PEM key to authenticate a custom repo
entitlement_repo_ca: ''
17 changes: 17 additions & 0 deletions roles/entitlement_deploy/files/mc_rhsm_ca.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# MachineConfig template that drops a custom repository CA file on the nodes
# of the "worker" role, at /etc/rhsm/ca/custom-repo-ca.pem (mode 0644).
#
# BASE64_ENCODED_RHSM_CA_FILE is a placeholder, not real data: the
# entitlement_deploy role passes this template, the placeholder name and the
# CA file path to apply_template.py before piping the result to `oc apply`
# (presumably the script substitutes the base64-encoded file content --
# confirm against apply_template.py).
apiVersion: machineconfiguration.openshift.io/v1
kind: MachineConfig
metadata:
labels:
machineconfiguration.openshift.io/role: worker
name: 50-rhsm-repo-ca
spec:
config:
ignition:
version: 2.2.0
storage:
files:
- contents:
source: data:text/plain;charset=utf-8;base64,BASE64_ENCODED_RHSM_CA_FILE
filesystem: root
mode: 0644
path: /etc/rhsm/ca/custom-repo-ca.pem
13 changes: 11 additions & 2 deletions roles/entitlement_deploy/tasks/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
set -o pipefail;
python3 "{{ entitlement_py_apply }}"
"{{ entitlement_mc_rhsm }}" BASE64_ENCODED_RHSM_FILE "{{ entitlement_rhsm }}"
| oc create -f-
| oc apply -f-
when: entitlement_rhsm | default('', true) | trim != ''

- block:
Expand All @@ -22,5 +22,14 @@
set -o pipefail;
python3 "{{ entitlement_py_apply }}"
"{{ entitlement_mc_pem }}" BASE64_ENCODED_PEM_FILE "{{ entitlement_pem }}"
| oc create -f-
| oc apply -f-
when: entitlement_pem | default('', true) | trim != ''

- block:
- name: "Deploy the repo CA from file '{{ entitlement_mc_rhsm_ca }}'"
shell:
set -o pipefail;
python3 "{{ entitlement_py_apply }}"
"{{ entitlement_mc_rhsm_ca }}" BASE64_ENCODED_RHSM_CA_FILE "{{ entitlement_repo_ca }}"
| oc apply -f-
when: entitlement_repo_ca | default('', true) | trim != ''
1 change: 1 addition & 0 deletions roles/entitlement_deploy/vars/main/resources.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,5 @@ entitlement_rhsm: 'roles/entitlement_deploy/files/rhsm.conf'
# template files for the MachineConfig resources
entitlement_mc_rhsm: 'roles/entitlement_deploy/files/mc_rhsm.yml'
entitlement_mc_pem: 'roles/entitlement_deploy/files/mc_pem.yml'
entitlement_mc_rhsm_ca: 'roles/entitlement_deploy/files/mc_rhsm_ca.yml'
entitlement_py_apply: 'roles/entitlement_deploy/files/apply_template.py'
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,40 @@ data:
echo "# WARNING: entitlement files missing"
echo "#"
fi
echo

if [[ "${RHEL_VERSION}" == "8.4" ]]; then
echo
echo "# enable OCP 4.8 / RHEL 8.4 beta repository"
if ! md5sum /etc/rhsm-host/ca/custom-repo-ca.pem; then
echo "# WARNING: custom certificate for RHEL 8.4 beta repo not found"
fi
rm -rf /etc/yum.repos.d/*.repo
cat << EOF > /etc/yum.repos.d/dci-rpm-mirrors.repo
[rhel-8-beta-baseos-rpms]
name = Red Hat Enterprise Linux 8 Beta BaseOS (RPMs)
baseurl = https://mirror.openshift.com/enterprise/reposync/ci-deps/rhel-8-beta-baseos-rpms/
failovermethod = priority
gpgcheck = 0
sslclientcert = /etc/rhsm-host/ca/custom-repo-ca.pem
sslclientkey = /etc/rhsm-host/ca/custom-repo-ca.pem
sslverify = 0
enabled = 1

[rhel-8-beta-appstream-rpms]
name = Red Hat Enterprise Linux 8 Beta AppStream (RPMs)
baseurl = https://mirror.openshift.com/enterprise/reposync/ci-deps/rhel-8-beta-appstream-rpms/
failovermethod = priority
gpgcheck = 0
sslclientcert = /etc/rhsm-host/ca/custom-repo-ca.pem
sslclientkey = /etc/rhsm-host/ca/custom-repo-ca.pem
sslverify = 0
enabled = 1
EOF
else
echo "# INFO: no need to enable RHEL beta repositories on ${RHEL_VERSION}"
fi

echo
echo "# test EUS and OCP repositories (debug)"

echo "${RHEL_VERSION}" > /etc/yum/vars/releasever
Expand Down
10 changes: 10 additions & 0 deletions roles/gpu_operator_set_repo-config/defaults/main/config.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
---
# Default variables of the gpu_operator_set_repo-config role.

# Set to true to use the built-in RHEL beta repository list instead of a
# user-provided file.
gpu_operator_set_repo_use_rhel_beta: false
# Path to the file to use as the repository list in the driver container
# (ignored if gpu_operator_set_repo_use_rhel_beta is true).
gpu_operator_set_repo_filename: ""


# Directory where the repo file will be stored in the driver container
# (used as spec.driver.repoConfig.destinationDir in the ClusterPolicy patch).
gpu_operator_set_repo_destdir: "/etc/distro.repos.d"
Empty file.
3 changes: 3 additions & 0 deletions roles/gpu_operator_set_repo-config/meta/main.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
---
# Role metadata: declares the check_deps role as a dependency, so Ansible
# runs it before this role's own tasks.
dependencies:
- role: check_deps
93 changes: 93 additions & 0 deletions roles/gpu_operator_set_repo-config/tasks/main.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
---
# Tasks of the gpu_operator_set_repo-config role.
#
# Overall flow: locate the ClusterPolicy, refuse to continue if the device
# plugin already reports ready Pods, build a repo list file, publish it as the
# "repo-config" ConfigMap, point spec.driver.repoConfig at that ConfigMap and
# delete the driver DaemonSet.
#
# NOTE(review): artifact_extra_logs_dir is not defined in this file; it is
# presumably provided by the surrounding toolbox configuration -- confirm.

# Step 1: the resource name (as printed by `-oname`) is reused later by the
# `oc patch` and `oc get` tasks. An empty output is treated as a failure.
- name: Find the name of the ClusterPolicy
block:
- name: Find the name of the ClusterPolicy
command: oc get ClusterPolicies -oname
register: cluster_policy_name
failed_when: cluster_policy_name.rc != 0 or not cluster_policy_name.stdout
rescue:
- name: Explain why we failed
fail: msg="Failed because no ClusterPolicy is available. Is the GPU Operator deployed?"

# Step 2: with --ignore-not-found, a missing DaemonSet gives an empty output,
# which is accepted; any value other than "0" ready Pods aborts the role.
- name: Ensure that the DevicePlugin is not running
block:
- name: Get the number of DevicePlugin Pods ready
command:
oc get ds/nvidia-device-plugin-daemonset
-n gpu-operator-resources
-ojsonpath={.status.numberReady}
--ignore-not-found=true
register: device_plugin_pods_ready
failed_when: device_plugin_pods_ready.stdout and device_plugin_pods_ready.stdout != "0"
rescue:
- name: Explain why we failed
fail: msg="Failed because NVIDIA driver seems to be already loaded"

# Step 3a: RHEL-beta mode writes a fixed repo list (BaseOS + AppStream beta
# repositories) into the artifacts directory.
- name: Prepare RHEL-beta repo list file
when: gpu_operator_set_repo_use_rhel_beta | bool
block:
- name: Prepare RHEL 8.4 beta repo list file
copy:
# from https://downloads.redhat.com/redhat/rhel/rhel-8-beta/rhel-8-beta.repo
content: |
[rhel-8-beta-baseos-rpms]
name = Red Hat Enterprise Linux 8 Beta BaseOS (RPMs)
baseurl = https://mirror.openshift.com/enterprise/reposync/ci-deps/rhel-8-beta-baseos-rpms/
failovermethod = priority
gpgcheck = 0
sslclientcert = /etc/rhsm-host/ca/custom-repo-ca.pem
sslclientkey = /etc/rhsm-host/ca/custom-repo-ca.pem
sslverify = 0
enabled = 1

[rhel-8-beta-appstream-rpms]
name = Red Hat Enterprise Linux 8 Beta AppStream (RPMs)
baseurl = https://mirror.openshift.com/enterprise/reposync/ci-deps/rhel-8-beta-appstream-rpms/
failovermethod = priority
gpgcheck = 0
sslclientcert = /etc/rhsm-host/ca/custom-repo-ca.pem
sslclientkey = /etc/rhsm-host/ca/custom-repo-ca.pem
sslverify = 0
enabled = 1

dest: "{{ artifact_extra_logs_dir }}/gpu-operator.dnf.repo"
mode: 0644

# Step 3b: otherwise the user-provided file is copied to the same destination.
# NOTE(review): nothing here checks that gpu_operator_set_repo_filename is
# non-empty before the copy.
- name: Prepare the user-provided repo list file
when: not gpu_operator_set_repo_use_rhel_beta | bool
block:
- name: Prepare the user-provided repo list file ({{ gpu_operator_set_repo_filename }})
copy:
src: "{{ gpu_operator_set_repo_filename }}"
dest: "{{ artifact_extra_logs_dir }}/gpu-operator.dnf.repo"
mode: 0644

# Step 4: delete-then-create, so that a ConfigMap left by a previous run does
# not make `oc create` fail.
- name: Delete the ConfigMap if it existed
command:
oc delete configmap repo-config
-n gpu-operator-resources
--ignore-not-found=true

- name: Create a ConfigMap with the repo list
command:
oc create configmap repo-config
-n gpu-operator-resources
--from-file "{{ artifact_extra_logs_dir }}/gpu-operator.dnf.repo"

# Step 5: merge-patch the ClusterPolicy so that the driver references the
# "repo-config" ConfigMap and the configured destination directory.
- name: Patch the ClusterPolicy spec.driver.repoConfig
command: |
oc patch {{ cluster_policy_name.stdout }}
--type merge
--patch
'{"spec":{"driver":{"repoConfig": {"configMapName":"repo-config", "destinationDir": "{{ gpu_operator_set_repo_destdir }}" }} }}'

# Step 6: keep a copy of the patched ClusterPolicy in the artifacts directory.
- name: Store the ClusterPolicy YAML definition
shell:
oc get {{ cluster_policy_name.stdout }} -oyaml
> {{ artifact_extra_logs_dir }}/ClusterPolicy.yml

# Step 7: remove the existing driver DaemonSet (if any); per the task name, the
# operator is expected to recreate it.
- name: Delete the driver DaemonSet for the operator to recreate it
command:
oc delete ds/nvidia-driver-daemonset
-n gpu-operator-resources
--ignore-not-found=true
Empty file.
15 changes: 10 additions & 5 deletions toolbox/entitlement/deploy.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,25 +4,30 @@ CURR_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
source ${CURR_DIR}/../_common.sh

usage() {
echo "Usage: $0 (--pem|--machine-configs) </path/to/file>"
echo "Usage: $0 (--pem|--machine-configs) </path/to/file> [--ca </path/to/ca>]"
}

if ! [ "$#" -eq 2 ]; then
echo "ERROR: please pass two arguments."
if ! [[ "$#" -eq 2 || "$#" -eq 4 ]]; then
echo "ERROR: please pass two or four arguments."
usage
exit 1
fi

if [[ "$1" == "--pem" ]]; then
ANSIBLE_OPTS="${ANSIBLE_OPTS} -e entitlement_pem=$2"
ANSIBLE_OPTS="${ANSIBLE_OPTS} -e entitlement_pem=$(realpath $2)"
echo "Using '$2' as PEM key"
elif [[ "$1" == "--machine-configs" ]]; then
ANSIBLE_OPTS="${ANSIBLE_OPTS} -e entitlement_resources=$2"
ANSIBLE_OPTS="${ANSIBLE_OPTS} -e entitlement_resources=$(realpath $2)"
echo "Using '$2' as entitlement resources"
else
echo "ERROR: please pass a valid flag."
usage
exit 1
fi

if [[ "${3:-}" == "--ca" ]]; then
ANSIBLE_OPTS="${ANSIBLE_OPTS} -e entitlement_repo_ca=$(realpath $4)"
echo "Using '$4' as repo CA"
fi

exec ansible-playbook ${ANSIBLE_OPTS} playbooks/entitlement_deploy.yml
37 changes: 37 additions & 0 deletions toolbox/gpu-operator/set_repo-config.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
#!/bin/bash
#
# Set the repository list used by the GPU Operator driver container.
#
# Usage: set_repo-config.sh <repo file path|--rhel-beta>
#   <repo file path>  existing repo list file; resolved to an absolute path
#                     and passed as gpu_operator_set_repo_filename
#   --rhel-beta       use the RHEL beta repo list built into the role
#                     (gpu_operator_set_repo_use_rhel_beta=true)
#
# Exactly one argument is required. The script sources ../_common.sh, then
# replaces itself with ansible-playbook on
# playbooks/gpu_operator_set_repo-config.yml.

# errexit is enabled here rather than on the shebang line, so that it still
# applies when the script is launched as `bash set_repo-config.sh`.
set -e

THIS_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"

RHEL_FLAG="--rhel-beta"

usage() {
  echo "Usage: $0 <repo file abs path|${RHEL_FLAG}>"
}

REPOFILE_USE_RHEL="false"
REPOFILE_FILENAME=""

# Reject zero arguments as well as too many: with no argument the script used
# to fall through to a misleading "File '' not found" error.
if [[ "$#" -ne 1 ]]; then
  echo "FATAL: expected 1 parameter ... (got $#: '$*')"
  usage
  exit 1
elif [[ "$1" == "${RHEL_FLAG}" ]]; then
  REPOFILE_USE_RHEL="true"
elif [[ "$1" == "--"* ]]; then
  # Any other long flag is refused (the supported one was matched above).
  echo "FATAL: only ${RHEL_FLAG} flag is allowed"
  usage
  exit 1
elif [[ ! -e "$1" ]]; then
  echo "FATAL: File '$1' not found"
  usage
  exit 1
else
  REPOFILE_FILENAME="$(realpath -- "$1")"
fi

# shellcheck source=/dev/null
source "${THIS_DIR}/../_common.sh"

# ANSIBLE_OPTS is a whitespace-separated option string (same convention as the
# other toolbox scripts), so a repo file path containing spaces is not
# supported.
ANSIBLE_OPTS="${ANSIBLE_OPTS} -e gpu_operator_set_repo_filename=${REPOFILE_FILENAME}"
ANSIBLE_OPTS="${ANSIBLE_OPTS} -e gpu_operator_set_repo_use_rhel_beta=${REPOFILE_USE_RHEL}"

# Word splitting of ANSIBLE_OPTS is intentional: it holds several options.
# shellcheck disable=SC2086
exec ansible-playbook ${ANSIBLE_OPTS} playbooks/gpu_operator_set_repo-config.yml