Skip to content

Commit

Permalink
Add UATs CI for bundle on EKS (#276)
Browse files Browse the repository at this point in the history
* Add tests for running UATs

---------

Co-authored-by: Noha Ihab <49988746+NohaIhab@users.noreply.github.com>
  • Loading branch information
misohu and NohaIhab authored Sep 30, 2024
1 parent 7b21655 commit 4b6f002
Show file tree
Hide file tree
Showing 6 changed files with 294 additions and 50 deletions.
12 changes: 5 additions & 7 deletions .github/cluster.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
apiVersion: eksctl.io/v1alpha5
availabilityZones:
- {{ region }}a
- {{ region }}b
- eu-central-1a
- eu-central-1b
cloudWatch:
clusterLogging: {}
iam:
Expand All @@ -10,8 +10,6 @@ iam:
addons:
- name: aws-ebs-csi-driver
serviceAccountRoleARN: "arn:aws:iam::aws:policy/service-role/AmazonEBSCSIDriverPolicy"
wellKnownPolicies:
ebsCSIController: true
kind: ClusterConfig
kubernetesNetworkConfig:
ipFamily: IPv4
Expand All @@ -35,6 +33,6 @@ managedNodeGroups:
alpha.eksctl.io/nodegroup-type: managed
volumeSize: 100
metadata:
name: mlflow-bundle-test
region: {{ region }}
version: "1.24"
name: mlflow-test
region: eu-central-1
version: "1.26"
10 changes: 10 additions & 0 deletions .github/dependencies.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Per-bundle-version tool and test dependencies for the EKS workflow.
# Each top-level key is a bundle version; the workflow selects one entry by
# passing its matrix `bundle_version` as `main_key` to the YAML-to-output action.
#   K8S_VERSION             - used for the kubectl snap channel and the EKS cluster version
#   JUJU_VERSION            - series segment of the juju binary download URL
#   JUJU_VERSION_WITH_PATCH - full juju release that is downloaded and installed
#   UATS_BRANCH             - branch of charmed-kubeflow-uats that is checked out for the UATs
# Versions are quoted so they stay strings (e.g. "1.29" is not parsed as a float).
"2.15":
K8S_VERSION: "1.29"
JUJU_VERSION: "3.4"
JUJU_VERSION_WITH_PATCH: "3.4.4"
UATS_BRANCH: "main"
# Entry selected when the bundle version is `latest`.
latest:
K8S_VERSION: "1.29"
JUJU_VERSION: "3.4"
JUJU_VERSION_WITH_PATCH: "3.4.4"
UATS_BRANCH: "main"
158 changes: 115 additions & 43 deletions .github/workflows/deploy-eks.yaml
Original file line number Diff line number Diff line change
@@ -1,44 +1,97 @@
name: Create EKS cluster, deploy MLflow and run bundle test
name: Create EKS cluster, deploy CKF and MLflow and run MLflow bundle UATs
on:
workflow_dispatch: # This event allows manual triggering from the Github UI
secrets:
BUNDLE_KUBEFLOW_EKS_AWS_ACCESS_KEY_ID:
required: true
BUNDLE_KUBEFLOW_EKS_AWS_SECRET_ACCESS_KEY:
required: true
inputs:
region:
description: 'Insert the AWS Region name in which the script will deploy the EKS cluster.'
bundle_version:
description: 'Comma-separated list of bundle versions e.g. 2.15, latest. Make sure that the corresponding K8s version is supported by the cloud.'
default: '2.15, latest'
required: true
k8s_version:
description: 'Kubernetes version to be used for the EKS cluster'
required: false
uats_branch:
description: 'Branch to run the UATs from e.g. main or track/1.9. By default, this is defined by the dependencies.yaml file.'
required: false
default: 'eu-central-1'
type: string
schedule:
- cron: "23 0 * * 2"
- cron: "23 0 * * 4"
jobs:
deploy-mlflow-to-eks:
preprocess-input:
runs-on: ubuntu-22.04
outputs:
processed_bundle_versions: ${{ steps.process_bundle_versions.outputs.bundle_versions }}
steps:
- name: Checkout repository
uses: actions/checkout@v2

- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: '3.x'

- name: Process bundle versions
  id: process_bundle_versions
  # `run` steps do not receive workflow inputs as INPUT_* environment
  # variables automatically, so export the input explicitly:
  # parse_versions.py reads INPUT_BUNDLE_VERSION. On scheduled runs the
  # input is empty and the script falls back to its default version list.
  env:
    INPUT_BUNDLE_VERSION: ${{ inputs.bundle_version }}
  run: python scripts/gh-actions/parse_versions.py

deploy-ckf-to-eks:
needs: preprocess-input
runs-on: ubuntu-22.04
strategy:
matrix:
bundle_version: ${{ fromJSON(needs.preprocess-input.outputs.processed_bundle_versions) }}
fail-fast: false
env:
PYTHON_VERSION: "3.8"

steps:
- name: Checkout repository
uses: actions/checkout@v2

- name: Install tox
- name: Run YAML to Github Output Action
id: yaml-output
uses: christian-ci/action-yaml-github-output@v2
with:
file_path: ".github/dependencies.yaml"
main_key: ${{ matrix.bundle_version }}

- name: Update ENV variables from inputs if available
run: |
python -m pip install --upgrade pip
pip install tox
K8S_VERSION=${{ inputs.k8s_version || env.K8S_VERSION }}
echo "K8S_VERSION=${K8S_VERSION}" >> $GITHUB_ENV
UATS_BRANCH=${{ inputs.uats_branch || env.UATS_BRANCH }}
echo "UATS_BRANCH=${UATS_BRANCH}" >> $GITHUB_ENV
# Remove once https://github.com/canonical/bundle-kubeflow/issues/761
# is resolved and applied to uats repository.
- name: Install python ${{ env.PYTHON_VERSION }}
run: |
sudo add-apt-repository ppa:deadsnakes/ppa -y
sudo apt update -y
sudo apt install python${{ env.PYTHON_VERSION }} python${{ env.PYTHON_VERSION }}-distutils python${{ env.PYTHON_VERSION }}-venv -y
- name: Install CLI tools
run: |
wget https://bootstrap.pypa.io/get-pip.py
python${{ env.PYTHON_VERSION }} get-pip.py
python${{ env.PYTHON_VERSION }} -m pip install tox
sudo snap install charmcraft --classic
# We need to install from binary because of this https://bugs.launchpad.net/juju/+bug/2007575
curl -LO https://launchpad.net/juju/${{ env.JUJU_VERSION }}/${{ env.JUJU_VERSION_WITH_PATCH }}/+download/juju-${{ env.JUJU_VERSION_WITH_PATCH }}-linux-amd64.tar.xz
tar xf juju-${{ env.JUJU_VERSION_WITH_PATCH }}-linux-amd64.tar.xz
sudo install -o root -g root -m 0755 juju /usr/local/bin/juju
juju version
- name: Configure AWS Credentials
env:
AWS_ACCESS_KEY_ID: ${{ secrets.BUNDLE_KUBEFLOW_EKS_AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.BUNDLE_KUBEFLOW_EKS_AWS_SECRET_ACCESS_KEY }}
run: |
aws configure set aws_access_key_id $AWS_ACCESS_KEY_ID
aws configure set aws_secret_access_key $AWS_SECRET_ACCESS_KEY
aws configure set default.region ${{ inputs.region }}
aws configure set default.region eu-central-1
- name: Install kubectl
run: |
sudo snap install kubectl --classic --channel=1.24/stable
sudo snap install kubectl --classic --channel=${{ env.K8S_VERSION }}/stable
mkdir ~/.kube
kubectl version --client
Expand All @@ -50,54 +103,73 @@ jobs:
sudo mv /tmp/eksctl /usr/local/bin
eksctl version
- name: Install juju
run: |
sudo snap install juju --classic --channel=2.9/stable
sudo snap install charmcraft --classic
juju version
- name: Create cluster
run: |
sed -i "s/{{ region }}/${{ inputs.region }}/" .github/cluster.yaml
VERSION=${{ matrix.bundle_version }}
VERSION_WITHOUT_DOT="${VERSION//.}"
yq e ".metadata.name |= \"mlflow-test-$VERSION_WITHOUT_DOT\"" -i .github/cluster.yaml
yq e ".metadata.version |= \"${{ env.K8S_VERSION }}\"" -i .github/cluster.yaml
eksctl create cluster -f .github/cluster.yaml
kubectl get nodes
- name: Setup juju
run: |
juju add-k8s kubeflow --client
juju bootstrap --no-gui kubeflow kubeflow-controller
juju add-k8s eks --client
juju bootstrap eks kubeflow-controller
juju add-model kubeflow
- name: Test bundle deployment
run: |
tox -vve test_bundle_deployment-${{ matrix.bundle_version }} -- --model kubeflow --keep-models -vv -s
- name: Run Kubeflow UATs
run: |
tox -vve bundle-test -- --model kubeflow --keep-models -vv -s
git clone https://github.com/canonical/charmed-kubeflow-uats.git ~/charmed-kubeflow-uats
cd ~/charmed-kubeflow-uats
git checkout ${{ env.UATS_BRANCH }}
tox -e mlflow-remote
# On failure, capture debugging resources
- name: Get all kubernetes resources
run: kubectl get all -A
if: failure()
- name: Save debug artifacts
uses: canonical/kubeflow-ci/actions/dump-charm-debug-artifacts@main
if: failure() || cancelled()

# On failure, capture debugging resources
- name: Get juju status
run: juju status
if: failure()
if: failure() || cancelled()

- name: Get juju debug logs
run: juju debug-log --replay --no-tail
if: failure() || cancelled()

- name: Get all kubernetes resources
run: kubectl get all -A
if: failure() || cancelled()

- name: Get logs from pods with status = Pending
run: kubectl -n kubeflow get pods | tail -n +2 | grep Pending | awk '{print $1}' | xargs -n1 kubectl -n kubeflow logs --all-containers=true --tail 100
if: failure() || cancelled()

- name: Get workload logs
run: kubectl logs --tail 100 -ntesting -lapp.kubernetes.io/name=mlflow-server
if: failure()
- name: Get logs from pods with status = Failed
run: kubectl -n kubeflow get pods | tail -n +2 | grep Failed | awk '{print $1}' | xargs -n1 kubectl -n kubeflow logs --all-containers=true --tail 100
if: failure() || cancelled()

- name: Get operator logs
run: kubectl logs --tail 100 -ntesting -loperator.juju.is/name=mlflow-server
if: failure()
- name: Get logs from pods with status = CrashLoopBackOff
run: kubectl -n kubeflow get pods | tail -n +2 | grep CrashLoopBackOff | awk '{print $1}' | xargs -n1 kubectl -n kubeflow logs --all-containers=true --tail 100
if: failure() || cancelled()

- name: Remove eks
- name: Delete EKS cluster
if: always()
run: |
eksctl delete cluster --name=mlflow-bundle-test
VERSION=${{ matrix.bundle_version }}
VERSION_WITHOUT_DOT="${VERSION//.}"
eksctl delete cluster --region eu-central-1 --name=mlflow-test-$VERSION_WITHOUT_DOT
delete-unattached-volumes:
if: always()
uses: ./.github/workflows/delete-aws-volumes.yaml
secrets: inherit
with:
region: ${{ inputs.region }}
needs: [deploy-mlflow-to-eks]
region: eu-central-1
needs: [deploy-ckf-to-eks]
25 changes: 25 additions & 0 deletions scripts/gh-actions/parse_versions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
import os
import sys
import json

def parse_versions(input_versions):
    """Convert a comma-separated list of bundle versions into a JSON array string.

    Args:
        input_versions: Comma-separated versions, e.g. "2.15, latest". May be
            empty or None.

    Returns:
        A JSON-encoded array of version strings, e.g. '["2.15", "latest"]'.
        When no usable version is supplied, the default
        '["2.15", "latest"]' is returned.
    """
    default_versions = ["2.15", "latest"]

    # Remove every kind of whitespace (spaces, tabs, newlines) from each entry.
    cleaned_entries = (
        "".join(entry.split()) for entry in (input_versions or "").split(",")
    )
    # Skip empty entries produced by trailing or doubled commas: an empty
    # string would otherwise become a bogus matrix value in the workflow.
    versions = [entry for entry in cleaned_entries if entry]

    # Fall back to the default when nothing usable remains (empty input,
    # whitespace only, or commas only).
    return json.dumps(versions or default_versions)

if __name__ == "__main__":
    # The requested versions are taken from the INPUT_BUNDLE_VERSION
    # environment variable; an unset variable is treated as an empty string.
    requested_versions = os.getenv('INPUT_BUNDLE_VERSION', '')
    output_line = f"bundle_versions={parse_versions(requested_versions)}"
    # Echo the result to stdout, then append the same line to the file named
    # by GITHUB_OUTPUT.
    print(output_line)
    with open(os.environ['GITHUB_OUTPUT'], 'a') as github_output:
        github_output.write(f"{output_line}\n")

Loading

0 comments on commit 4b6f002

Please sign in to comment.