Skip to content

Commit

Permalink
Use KFP lite deployment for presubmit tests (#1808)
Browse files Browse the repository at this point in the history
* Refactor presubmit-tests-with-pipeline-deployment.sh so that it can be run from a different project

* Simplify getting service account from cluster.

* Migrate presubmit-tests-with-pipeline-deployment.sh to use kfp
lightweight deployment.

* Add option to cache built images to make debugging faster.

* Fix cluster set up

* Copy image builder image instead of granting permission

* Add missing yes command

* fix stuff

* Let other usages of image-builder image become configurable

* let test workflow use image builder image

* Fix permission issue

* Hide irrelevant error logs

* Use shared service account key instead

* Move test manifest to test folder

* Move build-images.sh to a different script file

* Update README.md

* add cluster info dump

* Use the same cluster resources as kubeflow deployment

* Remove cluster info dump

* Add timing to test log

* cleaned up code

* fix tests

* address cr comments

* Address cr comments

* Enable image caching to improve retest speed
  • Loading branch information
Bobgy authored and IronPan committed Aug 21, 2019
1 parent 0864faf commit d11fae7
Show file tree
Hide file tree
Showing 9 changed files with 226 additions and 48 deletions.
2 changes: 2 additions & 0 deletions test/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# temporary folder used in tests
bin
5 changes: 5 additions & 0 deletions test/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,11 @@ Run the following commands from root of kubeflow/pipelines repo.
#$PULL_PULL_SHA and $WORKSPACE are env variables set by Prow
export PULL_PULL_SHA=pull-sha-placeholder
export WORKSPACE=$(pwd) # root of kubeflow/pipelines git repo
export SA_KEY_FILE=PATH/TO/YOUR/GCP/PROJECT/SERVICE/ACCOUNT/KEY
# (optional) uncomment the following to keep reusing the same cluster
# export TEST_CLUSTER=YOUR_PRECONFIGURED_CLUSTER_NAME
# (optional) uncomment the following to disable built image caching
# export DISABLE_IMAGE_CACHING=true
./test/presubmit-tests-with-pipeline-deployment.sh \
--workflow_file e2e_test_gke_v2.yaml \ # You can specify other workflows you want to test too.
Expand Down
59 changes: 59 additions & 0 deletions test/build-images.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
#!/bin/bash
#
# Copyright 2018 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

set -ex

# Builds the KFP docker images (api-server, frontend, scheduledworkflow,
# persistenceagent) for commit ${PULL_PULL_SHA} via an Argo workflow, unless
# cached images already exist under ${GCR_IMAGE_BASE_DIR}.
#
# Required env vars (set by the calling presubmit script — TODO confirm against
# presubmit-tests-with-pipeline-deployment.sh):
#   PROJECT, GCR_IMAGE_BASE_DIR, PULL_PULL_SHA, NAMESPACE, DIR,
#   remote_code_archive_uri
# Optional:
#   DISABLE_IMAGE_CACHING - set non-empty to force a rebuild.

IMAGE_BUILDER_ARG=""
# When the project is not ml-pipeline-test, VMs may not be able to pull the
# image-builder image from gcr.io/ml-pipeline-test, so copy it into our own
# registry first.
if [ "$PROJECT" != "ml-pipeline-test" ]; then
  COPIED_IMAGE_BUILDER_IMAGE="${GCR_IMAGE_BASE_DIR}/image-builder"
  echo "Copy image builder image to ${COPIED_IMAGE_BUILDER_IMAGE}"
  # `yes` auto-confirms the gcloud prompt.
  yes | gcloud container images add-tag \
    gcr.io/ml-pipeline-test/image-builder:v20181128-0.1.3-rc.1-109-ga5a14dc-e3b0c4 \
    "${COPIED_IMAGE_BUILDER_IMAGE}:latest"
  # NOTE: expanded UNQUOTED below on purpose, so it splits into two separate
  # `argo submit` arguments (or into nothing when empty).
  IMAGE_BUILDER_ARG="-p image-builder-image=${COPIED_IMAGE_BUILDER_IMAGE}"
fi

# Image caching can be turned off by setting the $DISABLE_IMAGE_CACHING env
# flag. Note that GCR_IMAGE_BASE_DIR contains the commit hash, so whenever
# there's a code change, we won't use caches for sure.
BUILT_IMAGES=$(gcloud container images list --repository="${GCR_IMAGE_BASE_DIR}")
if
  test -z "$DISABLE_IMAGE_CACHING" && \
  echo "$BUILT_IMAGES" | grep api-server && \
  echo "$BUILT_IMAGES" | grep frontend && \
  echo "$BUILT_IMAGES" | grep scheduledworkflow && \
  echo "$BUILT_IMAGES" | grep persistenceagent;
then
  echo "docker images for api-server, frontend, scheduledworkflow and \
persistenceagent are already built in ${GCR_IMAGE_BASE_DIR}."
else
  echo "submitting argo workflow to build docker images for commit ${PULL_PULL_SHA}..."
  # Build Images. $( ... ) replaces the legacy backtick substitution.
  ARGO_WORKFLOW=$(argo submit "${DIR}/build_image.yaml" \
    -p image-build-context-gcs-uri="$remote_code_archive_uri" \
    ${IMAGE_BUILDER_ARG} \
    -p api-image="${GCR_IMAGE_BASE_DIR}/api-server" \
    -p frontend-image="${GCR_IMAGE_BASE_DIR}/frontend" \
    -p scheduledworkflow-image="${GCR_IMAGE_BASE_DIR}/scheduledworkflow" \
    -p persistenceagent-image="${GCR_IMAGE_BASE_DIR}/persistenceagent" \
    -n "${NAMESPACE}" \
    --serviceaccount test-runner \
    -o name
  )
  echo "build docker images workflow submitted successfully"
  # check-argo-status.sh reads $ARGO_WORKFLOW and blocks until completion.
  source "${DIR}/check-argo-status.sh"
  echo "build docker images workflow completed"
fi
5 changes: 3 additions & 2 deletions test/check-argo-status.sh
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,8 @@ echo "check status of argo workflow $ARGO_WORKFLOW...."
# Probe the argo workflow status until it completes; time out after 30 minutes.
for i in $(seq 1 ${PULL_ARGO_WORKFLOW_STATUS_MAX_ATTEMPT})
do
WORKFLOW_STATUS=`kubectl get workflow $ARGO_WORKFLOW -n ${NAMESPACE} --show-labels`
WORKFLOW_STATUS=`kubectl get workflow $ARGO_WORKFLOW -n ${NAMESPACE} --show-labels 2>&1` \
|| echo kubectl get workflow failed with "$WORKFLOW_STATUS" # Tolerate temporary network failure during kubectl get workflow
echo $WORKFLOW_STATUS | grep ${WORKFLOW_COMPLETE_KEYWORD} && s=0 && break || s=$? && printf "Workflow ${ARGO_WORKFLOW} is not finished.\n${WORKFLOW_STATUS}\nSleep for 20 seconds...\n" && sleep 20
done

Expand Down Expand Up @@ -54,4 +55,4 @@ if [[ $WORKFLOW_STATUS = *"${WORKFLOW_FAILED_KEYWORD}"* ]]; then
exit 1
else
argo get ${ARGO_WORKFLOW} -n ${NAMESPACE}
fi
fi
73 changes: 73 additions & 0 deletions test/deploy-cluster.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
#!/bin/bash
#
# Copyright 2018 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

set -ex

# Derive the test cluster name.
# Specify the TEST_CLUSTER env variable to reuse an existing cluster; otherwise
# a fresh name is generated from the workflow file name, the commit SHA and a
# random suffix, e.g. "e2e-1a2b3c4-12345" for WORKFLOW_FILE=e2e_test_gke_v2.yaml.
TEST_CLUSTER_PREFIX=${WORKFLOW_FILE%.*}
# ${VAR%%_*} keeps everything before the first underscore; this replaces the
# previous unquoted `echo | cut -d _ -f 1` pipeline without forking processes.
TEST_CLUSTER_DEFAULT=${TEST_CLUSTER_PREFIX%%_*}-${PULL_PULL_SHA:0:7}-${RANDOM}
TEST_CLUSTER=${TEST_CLUSTER:-${TEST_CLUSTER_DEFAULT}}
# Flipped to true later only when this script creates a brand-new cluster, so
# the EXIT trap knows whether to delete it.
SHOULD_CLEANUP_CLUSTER=false

# Dump pod status and optionally delete the test cluster when the script exits
# (normally, on error, or on SIGINT/SIGTERM).
function clean_up {
  set +e # the following clean up commands shouldn't exit on error

  echo "Status of pods before clean up:"
  kubectl get pods --all-namespaces

  echo "Clean up..."
  # Only delete clusters this script created; pre-existing clusters passed in
  # via $TEST_CLUSTER are kept for reuse.
  # Fix: the original unquoted `[ $SHOULD_CLEANUP_CLUSTER == true ]` is a
  # syntax error when the variable is empty/unset; [[ ]] plus a default is safe.
  if [[ "${SHOULD_CLEANUP_CLUSTER:-false}" == true ]]; then
    # --async doesn't wait for this operation to complete, so we can get test
    # results faster
    yes | gcloud container clusters delete "${TEST_CLUSTER}" --async
  fi
}
trap clean_up EXIT SIGINT SIGTERM

cd "${DIR}"

# Reuse ${TEST_CLUSTER} if it already exists; otherwise create it and mark it
# for deletion by the clean_up trap.
if gcloud container clusters describe "${TEST_CLUSTER}" &>/dev/null; then
  echo "Use existing test cluster: ${TEST_CLUSTER}"
else
  echo "Creating a new test cluster: ${TEST_CLUSTER}"
  SHOULD_CLEANUP_CLUSTER=true
  # "storage-rw" is needed to allow VMs to push to gcr.io
  # reference: https://cloud.google.com/compute/docs/access/service-accounts#accesscopesiam
  SCOPE_ARG="--scopes=storage-rw"
  # Machine type and cluster size is the same as kubeflow deployment to
  # easily compare performance. We can reduce usage later.
  NODE_POOL_CONFIG_ARG="--num-nodes=2 --machine-type=n1-standard-8 \
--enable-autoscaling --max-nodes=8 --min-nodes=2"
  # ${SCOPE_ARG} and ${NODE_POOL_CONFIG_ARG} are intentionally unquoted so the
  # shell splits them into individual gcloud flags.
  gcloud container clusters create "${TEST_CLUSTER}" ${SCOPE_ARG} ${NODE_POOL_CONFIG_ARG}
fi

gcloud container clusters get-credentials "${TEST_CLUSTER}"

# When we reuse a cluster for debugging, clean up its KFP installation first.
# This does nothing on a freshly created cluster.
kubectl delete namespace ${NAMESPACE} --wait || echo "No need to delete ${NAMESPACE} namespace. It doesn't exist."
# create --dry-run | apply makes namespace creation idempotent across reruns.
kubectl create namespace "${NAMESPACE}" --dry-run -o yaml | kubectl apply -f -

# Fix: quote the -z test; the original unquoted `[ -z $SA_KEY_FILE ]` only
# worked by accident when the variable was unset or empty.
if [ -z "$SA_KEY_FILE" ]; then
  SA_KEY_FILE=${DIR}/key.json
  # The service account key is for default VM service account.
  # ref: https://cloud.google.com/compute/docs/access/service-accounts#compute_engine_default_service_account
  # It was generated by the following command
  # `gcloud iam service-accounts keys create $SA_KEY_FILE --iam-account ${VM_SERVICE_ACCOUNT}`
  # Because there's a limit of 10 keys per service account, we are reusing the
  # same key stored in the following bucket.
  gsutil cp "gs://ml-pipeline-test-keys/ml-pipeline-test-sa-key.json" "$SA_KEY_FILE"
fi
# Expose the key to test workflows as the `user-gcp-sa` secret in ${NAMESPACE}.
kubectl create secret -n "${NAMESPACE}" generic user-gcp-sa --from-file=user-gcp-sa.json="$SA_KEY_FILE" --dry-run -o yaml | kubectl apply -f -
47 changes: 47 additions & 0 deletions test/deploy-pipeline-lite.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
#!/bin/bash
#
# Copyright 2018 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

set -ex

# Deploys a KFP "lite" (namespaced) installation to the current cluster,
# pointing it at the images built for this commit under ${GCR_IMAGE_BASE_DIR}.
# Required env vars: DIR, GCR_IMAGE_BASE_DIR, NAMESPACE.

if ! command -v kustomize; then
  # Download kustomize cli tool. `command -v` is the portable, builtin
  # replacement for the external `which`.
  TOOL_DIR=${DIR}/bin
  mkdir -p "${TOOL_DIR}"
  wget https://github.com/kubernetes-sigs/kustomize/releases/download/v3.1.0/kustomize_3.1.0_linux_amd64 -O "${TOOL_DIR}/kustomize"
  chmod +x "${TOOL_DIR}/kustomize"
  PATH=${PATH}:${TOOL_DIR}
fi

# delete argo first because KFP comes with argo too
kubectl delete namespace argo --wait || echo "No argo installed"

KFP_MANIFEST_DIR=${DIR}/manifests
pushd "${KFP_MANIFEST_DIR}"

# This is the recommended approach to override images at deploy time.
# reference: https://github.com/kubernetes-sigs/kustomize/blob/master/docs/eschewedFeatures.md#build-time-side-effects-from-cli-args-or-env-variables
kustomize edit set image "gcr.io/ml-pipeline/api-server=${GCR_IMAGE_BASE_DIR}/api-server:latest"
kustomize edit set image "gcr.io/ml-pipeline/persistenceagent=${GCR_IMAGE_BASE_DIR}/persistenceagent:latest"
kustomize edit set image "gcr.io/ml-pipeline/scheduledworkflow=${GCR_IMAGE_BASE_DIR}/scheduledworkflow:latest"
kustomize edit set image "gcr.io/ml-pipeline/frontend=${GCR_IMAGE_BASE_DIR}/frontend:latest"
# Log the resulting kustomization for debugging.
cat kustomization.yaml

kustomize build . | kubectl apply -f -
# show current info
echo "Status of pods after kubectl apply"
kubectl get pods -n "${NAMESPACE}"

popd
27 changes: 16 additions & 11 deletions test/install-argo.sh
Original file line number Diff line number Diff line change
Expand Up @@ -20,20 +20,25 @@ set -ex
kubectl config set-context $(kubectl config current-context) --namespace=default
echo "Add necessary cluster role bindings"
ACCOUNT=$(gcloud info --format='value(config.account)')
kubectl create clusterrolebinding PROW_BINDING --clusterrole=cluster-admin --user=$ACCOUNT
kubectl create clusterrolebinding DEFAULT_BINDING --clusterrole=cluster-admin --serviceaccount=default:default
kubectl create clusterrolebinding PROW_BINDING --clusterrole=cluster-admin --user=$ACCOUNT --dry-run -o yaml | kubectl apply -f -
kubectl create clusterrolebinding DEFAULT_BINDING --clusterrole=cluster-admin --serviceaccount=default:default --dry-run -o yaml | kubectl apply -f -

echo "install argo"
ARGO_VERSION=v2.3.0
mkdir -p ~/bin/
export PATH=~/bin/:$PATH
curl -sSL -o ~/bin/argo https://github.com/argoproj/argo/releases/download/$ARGO_VERSION/argo-linux-amd64
chmod +x ~/bin/argo
#kubectl create ns argo
#kubectl apply -n argo -f https://raw.githubusercontent.com/argoproj/argo/$ARGO_VERSION/manifests/install.yaml

# if argo is not installed
if ! which argo; then
echo "install argo"
mkdir -p ~/bin/
export PATH=~/bin/:$PATH
curl -sSL -o ~/bin/argo https://github.com/argoproj/argo/releases/download/$ARGO_VERSION/argo-linux-amd64
chmod +x ~/bin/argo
fi

kubectl create ns argo --dry-run -o yaml | kubectl apply -f -
kubectl apply -n argo -f https://raw.githubusercontent.com/argoproj/argo/$ARGO_VERSION/manifests/install.yaml

# Some workflows are deployed to the non-default namespace where the GCP credential secret is stored
# In this case, the default service account in that namespace doesn't have enough permission
echo "add service account for running the test workflow"
kubectl create serviceaccount test-runner -n ${NAMESPACE}
kubectl create clusterrolebinding test-admin-binding --clusterrole=cluster-admin --serviceaccount=${NAMESPACE}:test-runner
kubectl create serviceaccount test-runner -n ${NAMESPACE} --dry-run -o yaml | kubectl apply -f -
kubectl create clusterrolebinding test-admin-binding --clusterrole=cluster-admin --serviceaccount=${NAMESPACE}:test-runner --dry-run -o yaml | kubectl apply -f -
7 changes: 7 additions & 0 deletions test/manifests/kustomization.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Kustomization used by the test scripts to deploy a namespaced KFP install.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization

# Actual image overrides will be added in test scripts.
# (deploy-pipeline-lite.sh runs `kustomize edit set image ...` in this folder.)
images: []
resources:
- ../../manifests/kustomize/namespaced-install
49 changes: 14 additions & 35 deletions test/presubmit-tests-with-pipeline-deployment.sh
Original file line number Diff line number Diff line change
Expand Up @@ -68,45 +68,25 @@ GCR_IMAGE_BASE_DIR=gcr.io/${PROJECT}/${PULL_PULL_SHA}
TEST_RESULTS_GCS_DIR=gs://${TEST_RESULT_BUCKET}/${PULL_PULL_SHA}/${TEST_RESULT_FOLDER}
DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" > /dev/null && pwd)"

# Configure `time` command output format.
TIMEFORMAT="[test-timing] It took %lR."

echo "presubmit test starts"
source "${DIR}/test-prep.sh"
time source "${DIR}/test-prep.sh"
echo "test env prepared"

# Deploy Kubeflow
source "${DIR}/deploy-kubeflow.sh"
time source "${DIR}/deploy-cluster.sh"
echo "cluster deployed"

# Install Argo CLI and test-runner service account
source "${DIR}/install-argo.sh"
time source "${DIR}/install-argo.sh"
echo "argo installed"

IMAGE_BUILDER_ARG=""
# When project is not ml-pipeline-test, VMs need permission to fetch some images in gcr.io/ml-pipeline-test.
if [ "$PROJECT" != "ml-pipeline-test" ]; then
COPIED_IMAGE_BUILDER_IMAGE=${GCR_IMAGE_BASE_DIR}/image-builder
echo "Copy image builder image to ${COPIED_IMAGE_BUILDER_IMAGE}"
yes | gcloud container images add-tag \
gcr.io/ml-pipeline-test/image-builder:v20181128-0.1.3-rc.1-109-ga5a14dc-e3b0c4 \
${COPIED_IMAGE_BUILDER_IMAGE}:latest
IMAGE_BUILDER_ARG="-p image-builder-image=${COPIED_IMAGE_BUILDER_IMAGE}"
fi
time source "${DIR}/build-images.sh"
echo "KFP images built"

# Build Images
echo "submitting argo workflow to build docker images for commit ${PULL_PULL_SHA}..."
ARGO_WORKFLOW=`argo submit ${DIR}/build_image.yaml \
-p image-build-context-gcs-uri="$remote_code_archive_uri" \
${IMAGE_BUILDER_ARG} \
-p api-image="${GCR_IMAGE_BASE_DIR}/api-server" \
-p frontend-image="${GCR_IMAGE_BASE_DIR}/frontend" \
-p scheduledworkflow-image="${GCR_IMAGE_BASE_DIR}/scheduledworkflow" \
-p persistenceagent-image="${GCR_IMAGE_BASE_DIR}/persistenceagent" \
-n ${NAMESPACE} \
--serviceaccount test-runner \
-o name
`
echo "build docker images workflow submitted successfully"
source "${DIR}/check-argo-status.sh"
echo "build docker images workflow completed"

# Deploy the pipeline
source ${DIR}/deploy-pipeline.sh --gcr_image_base_dir ${GCR_IMAGE_BASE_DIR}
time source "${DIR}/deploy-pipeline-lite.sh"
echo "KFP lite deployed"

echo "submitting argo workflow to run tests for commit ${PULL_PULL_SHA}..."
ARGO_WORKFLOW=`argo submit ${DIR}/${WORKFLOW_FILE} \
Expand All @@ -119,7 +99,6 @@ ${IMAGE_BUILDER_ARG} \
--serviceaccount test-runner \
-o name
`

echo "test workflow submitted successfully"
source "${DIR}/check-argo-status.sh"
time source "${DIR}/check-argo-status.sh"
echo "test workflow completed"

0 comments on commit d11fae7

Please sign in to comment.