Skip to content
This repository has been archived by the owner on Aug 16, 2023. It is now read-only.

Commit

Permalink
Merge pull request #400 from kpouget/update
Browse files Browse the repository at this point in the history
RHODS: update RHODS version + various updates
  • Loading branch information
kpouget authored Jun 30, 2022
2 parents bec62f7 + a5ca8ef commit d9ed58a
Show file tree
Hide file tree
Showing 17 changed files with 69 additions and 22 deletions.
2 changes: 1 addition & 1 deletion config/ansible.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ remote_user = root
roles_path = roles/
gathering = smart
fact_caching = yaml
fact_caching_timeout = 600
fact_caching_timeout = 0
callbacks_enabled = json_to_logfile, timer, profile_roles
inventory_ignore_extensions = secrets.py, .pyc, .cfg, .crt, .ini
# work around privilege escalation timeouts in ansible:
Expand Down
3 changes: 2 additions & 1 deletion roles/cluster_prometheus_db/defaults/main/config.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
---
cluster_prometheus_db_mode: # dump or reset
cluster_prometheus_db_label: app.kubernetes.io/component=prometheus
cluster_prometheus_db_namespace: openshift-monitoring
cluster_prometheus_db_namespace: openshift-monitoring
cluster_prometheus_db_directory: /prometheus
2 changes: 1 addition & 1 deletion roles/cluster_prometheus_db/tasks/dump.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

- name: Extract Prometheus database from the Pod
shell:
oc exec -c prometheus -n "{{ cluster_prometheus_db_namespace }}" "{{ prometheus_pod_name_cmd.stdout }}" -- tar cvzf - -C /prometheus . > "{{ artifact_extra_logs_dir }}/prometheus.tar.gz"
oc exec -c prometheus -n "{{ cluster_prometheus_db_namespace }}" "{{ prometheus_pod_name_cmd.stdout }}" -- tar cvzf - -C "{{ cluster_prometheus_db_directory }}" . > "{{ artifact_extra_logs_dir }}/prometheus.tar.gz"
register: extract_prometheus_db
# for tar, 0 means OK, 1 means file changed as we read it (we ignore it, 'Prometheus updates files atomically')
failed_when: extract_prometheus_db.rc > 1
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ Can Login to Jupyterhub
Can Spawn Notebook
[Tags] Notebook
Fix Spawner Status
Spawn Notebook With Arguments image=s2i-generic-data-science-notebook size=Small spawner_timeout=5 minutes
Spawn Notebook With Arguments image=s2i-generic-data-science-notebook size=Default spawner_timeout=5 minutes

Git Clone the PSAP notebooks
[Tags] Notebook
Expand Down
8 changes: 7 additions & 1 deletion roles/rhods_test_jupyterlab/files/s3-artifacts-exporter.sh
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,13 @@ echo "$(date) '${ARTIFACTS_DIR}/test.exit_code' appeared."

set -x

if [[ "$ARTIFACTS_COLLECTED" == "no-image" ]]; then
test_failed=$(cat ${ARTIFACTS_DIR}/test.exit_code)

delete_image=0
[[ "$ARTIFACTS_COLLECTED" == "no-image" ]] && delete_image=1
[[ "$ARTIFACTS_COLLECTED" == "no-image-except-if-failed" && "$test_failed" == 0 ]] && delete_image=1

if [[ "$delete_image" == 1 ]]; then
    find "${ARTIFACTS_DIR}" -name '*.png' -delete > /dev/null
fi

Expand Down
27 changes: 22 additions & 5 deletions roles/rhods_test_jupyterlab/tasks/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -170,11 +170,13 @@
oc delete ev -n {{ rhods_test_namespace }} --all
failed_when: false

- name: Delete the events of the notebook namespace
- name: Delete the events, pods and PVC of the notebook namespace
environment:
KUBECONFIG: '{{ sut_cluster_kubeconfig }}'
command:
oc delete ev -n {{ rhods_notebook_namespace }} --all
command: |
oc delete pod --all -n {{ rhods_notebook_namespace }}
oc delete pvc --all -n {{ rhods_notebook_namespace }}
oc delete ev --all -n {{ rhods_notebook_namespace }}
failed_when: false

- name: Name the namespace privileged
Expand Down Expand Up @@ -212,8 +214,8 @@
- name: Cleanup the notebooks Pods and PVCs
# (the Pods are destroyed anyway when the ods-ci test succeeds)
shell: |
oc delete pod --all -n rhods-notebooks
oc delete pvc --all -n rhods-notebooks
oc delete pod --all -n {{ rhods_notebook_namespace }}
oc delete pvc --all -n {{ rhods_notebook_namespace }}
environment:
KUBECONFIG: '{{ sut_cluster_kubeconfig }}'

Expand Down Expand Up @@ -313,13 +315,28 @@
set -o pipefail;
cat "{{ artifact_extra_logs_dir }}"/ods-ci/ods-ci-*/test.exit_code | grep '^0$' | wc -l
failed_when: false
register: success_count_cmd

- name: Count failed tests
shell:
set -o pipefail;
cat "{{ artifact_extra_logs_dir }}"/ods-ci/ods-ci-*/test.exit_code | grep -v '^0$' | wc -l
failed_when: false

- name: Show failed tests
shell:
grep -v '^0$' "{{ artifact_extra_logs_dir }}"/ods-ci/ods-ci-*/test.exit_code
failed_when: false
register: failed_tests_cmd

- name: Save the success count and failed tests
shell: |
echo "{{ success_count_cmd.stdout }}/{{ rhods_test_jupyterlab_user_count }}" \
> "{{ artifact_extra_logs_dir }}/success_count"
echo "{{ failed_tests_cmd.stdout }}" \
> "{{ artifact_extra_logs_dir }}/failed_tests"
failed_when: false

- name: Test if the RHODS test job crashed
command:
oc get jobs/ods-ci -ojsonpath={.status.failed} -n {{ rhods_test_namespace }}
Expand Down
4 changes: 2 additions & 2 deletions roles/rhods_test_jupyterlab/templates/rhods-ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -54,10 +54,10 @@ spec:
mountPath: /mnt/rhods-jupyterlab-entrypoint
resources:
requests:
memory: 400M
memory: 750M
cpu: 0.2
limits:
memory: 500M
memory: 750M
cpu: 0.2
- image: "quay.io/centos/centos:stream8"
name: artifacts-exporter
Expand Down
2 changes: 2 additions & 0 deletions subprojects/deploy-cluster/config_install.mk
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
# https://mirror.openshift.com/pub/openshift-v4/clients/ocp/latest/openshift-install-linux.tar.gz"
# https://mirror.openshift.com/pub/openshift-v4/x86_64/clients/ocp-dev-preview/pre-release/openshift-install-linux.tar.gz"

SHELL=/usr/bin/env bash -o pipefail

has_installer: ${OPENSHIFT_INSTALLER}
${OPENSHIFT_INSTALLER}:
@echo "WARNING: Installer v${OCP_VERSION} not found: ${OPENSHIFT_INSTALLER}"
Expand Down
1 change: 1 addition & 0 deletions testing/ods/clusters.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
set -o pipefail
set -o errexit
set -o nounset
set -o errtrace
set -x

THIS_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
Expand Down
2 changes: 1 addition & 1 deletion testing/ods/common.sh
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ S3_LDAP_PROPS="${PSAP_ODS_SECRET_PATH}/s3_ldap.passwords"
OSD_USE_ODS_CATALOG=${OSD_USE_ODS_CATALOG:-0}

ODS_QE_CATALOG_IMAGE="quay.io/modh/qe-catalog-source"
ODS_QE_CATALOG_IMAGE_TAG="v1100-6"
ODS_QE_CATALOG_IMAGE_TAG="v1121-1"

ODS_CI_TEST_NAMESPACE=loadtest
ODS_CI_REPO="https://github.com/openshift-psap/ods-ci.git"
Expand Down
4 changes: 3 additions & 1 deletion testing/ods/jh-at-scale.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
set -o errexit
set -o pipefail
set -o nounset
set -o errtrace
set -x

THIS_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
Expand Down Expand Up @@ -202,7 +203,7 @@ run_multi_cluster() {
if [[ "$ODS_CI_NB_USERS" -le 5 ]]; then
collect=all
else
collect=no-image
collect=no-image-except-if-failed
fi

./run_toolbox.py rhods test_jupyterlab \
Expand All @@ -212,6 +213,7 @@ run_multi_cluster() {
--sut_cluster_kubeconfig="$KUBECONFIG_SUTEST" \
--artifacts-collected=$collect

set +e # we do not want to fail past this point

dump_prometheus_dbs
}
Expand Down
6 changes: 4 additions & 2 deletions testing/ods/ocp_cluster.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
set -o pipefail
set -o errexit
set -o nounset
set -o errtrace
set -x

THIS_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
source "$THIS_DIR/common.sh"
Expand Down Expand Up @@ -75,8 +77,8 @@ create_cluster() {
fi
}

# ensure that the cluster's 'metadata.json' is always copied to the SHARED_DIR
trap save_install_artifacts EXIT
# ensure that the cluster's 'metadata.json' is copied to the SHARED_DIR even in case of errors
trap save_install_artifacts EXIT SIGTERM SIGINT

make cluster \
OCP_VERSION="${OCP_VERSION}" \
Expand Down
2 changes: 1 addition & 1 deletion testing/ods/osd_cluster.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
set -o pipefail
set -o errexit
set -o nounset

set -o errtrace

THIS_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
source "$THIS_DIR/common.sh"
Expand Down
6 changes: 4 additions & 2 deletions testing/ods/process_ctrl.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,11 @@ process_ctrl::run_in_bg() {
process_ctrl::wait_bg_processes() {
echo "Waiting for the background processes '${process_ctrl__wait_list[@]}' to terminate ..."
for pid in ${process_ctrl__wait_list[@]}; do
if ! wait $pid # this syntax honors the `set -e` flag
retcode=0
wait $pid || retcode=$? # this syntax honors the `set -e` flag
if [[ "$retcode" != "0" ]];
then
echo "Process $pid failed :( retcode=$?"
echo "Process $pid failed :( retcode=$retcode"
false
fi
done
Expand Down
2 changes: 1 addition & 1 deletion testing/ods/sizing/notebook_sizes
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# name, cpu (float), memory (float, in GB)
test_pod, cpu=0.2, memory=0.4
test_pod, cpu=0.2, memory=0.75
default, cpu=1, memory=4
small, cpu=1, memory=8
medium, cpu=3, memory=24
8 changes: 8 additions & 0 deletions toolbox/_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,7 @@ def run_ansible_role(role_name, opts: dict = dict()):
sys.stdout.flush()
sys.stderr.flush()

ret = -1
try:
run_result = subprocess.run(cmd, env=env, check=False)
ret = run_result.returncode
Expand All @@ -141,4 +142,11 @@ def run_ansible_role(role_name, opts: dict = dict()):
except FileNotFoundError:
pass # play file was removed, ignore

with open(artifact_extra_logs_dir / "exit_code", "w") as f:
print(f"{ret}", file=f)

if ret != 0:
with open(artifact_extra_logs_dir / "FAILURE", "w") as f:
print(f"{ret}", file=f)

raise SystemExit(ret)
10 changes: 8 additions & 2 deletions toolbox/rhods.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,11 @@ def test_jupyterlab(idp_name, username_prefix, user_count: int,
user_count: Number of users to run in parallel
secret_properties_file: Path of a file containing the properties of LDAP secrets. (See 'deploy_ldap' command)
sut_cluster_kubeconfig: Optional. Path of the system-under-test cluster's Kubeconfig. If provided, the RHODS endpoints will be looked up in this cluster.
artifacts_collected: Optional. 'all': collect all the artifacts generated by ODS-CI. 'no-image': exclude the images (.png) from the artifacts collected. 'none': do not collect any ODS-CI artifact. Default 'all'.
artifacts_collected: Optional. Default 'all'.
- 'all': collect all the artifacts generated by ODS-CI.
- 'no-image': exclude the images (.png) from the artifacts collected.
- 'no-image-except-if-failed': exclude the images, except if the test failed.
- 'none': do not collect any ODS-CI artifact.
ods_ci_test_case: Optional. ODS-CI test case to execute.
ods_ci_exclude_tags: Optional. Tags to exclude in the ODS-CI test case.
"""
Expand All @@ -88,7 +92,7 @@ def test_jupyterlab(idp_name, username_prefix, user_count: int,
"rhods_test_jupyterlab_ods_ci_exclude_tags": ods_ci_exclude_tags
}

ARTIFACTS_COLLECTED_VALUES = ("all", "none", "no-image")
ARTIFACTS_COLLECTED_VALUES = ("all", "none", "no-image", "no-image-except-if-failed")
if artifacts_collected not in ARTIFACTS_COLLECTED_VALUES:
print(f"ERROR: invalid value '{artifacts_collected}' for 'artifacts_collected'. Must be one of {', '.join(ARTIFACTS_COLLECTED_VALUES)}")
sys.exit(1)
Expand Down Expand Up @@ -181,6 +185,8 @@ def dump_prometheus_db():
"cluster_prometheus_db_mode": "dump",
"cluster_prometheus_db_label": "deployment=prometheus",
"cluster_prometheus_db_namespace": "redhat-ods-monitoring",
"cluster_prometheus_db_directory": "/prometheus/data",

}

return RunAnsibleRole("cluster_prometheus_db", opts)
Expand Down

0 comments on commit d9ed58a

Please sign in to comment.