Switching test to kubeflow deployment #351

Merged
52 commits, merged Nov 29, 2018

Commits (52). The diff shown below reflects changes from 9 of the 52 commits.
1c7f2c9  test (IronPan, Nov 21, 2018)
213795b  fix (IronPan, Nov 21, 2018)
47d5457  fix (IronPan, Nov 21, 2018)
0ea4e69  fix (IronPan, Nov 21, 2018)
e2f5ea1  fix (IronPan, Nov 21, 2018)
bf05e06  fix (IronPan, Nov 21, 2018)
b83c4c1  update (IronPan, Nov 21, 2018)
21f88d0  cleanup (IronPan, Nov 21, 2018)
8c3f83f  fix (IronPan, Nov 21, 2018)
b78dc1b  coopy test (IronPan, Nov 21, 2018)
1323ab4  chmod (IronPan, Nov 21, 2018)
ab93fe8  fix (IronPan, Nov 21, 2018)
39d1712  fix (IronPan, Nov 21, 2018)
f694670  fix (IronPan, Nov 21, 2018)
1974bd3  fix (IronPan, Nov 21, 2018)
61a8ef4  fix (IronPan, Nov 21, 2018)
33cad5b  fix (IronPan, Nov 21, 2018)
97de358  fix (IronPan, Nov 21, 2018)
08a54ef  fix (IronPan, Nov 21, 2018)
6afcc66  fix (IronPan, Nov 21, 2018)
fa492e7  fix (IronPan, Nov 21, 2018)
7bd127a  fix (IronPan, Nov 22, 2018)
8b37fa2  fix (IronPan, Nov 22, 2018)
1d55b6f  fix (IronPan, Nov 22, 2018)
0776bf6  fix (IronPan, Nov 22, 2018)
fae10e9  fix (IronPan, Nov 22, 2018)
f198ccb  fix (IronPan, Nov 22, 2018)
ed04d33  fix (IronPan, Nov 22, 2018)
bafc268  fix (IronPan, Nov 22, 2018)
2ef2610  fix (IronPan, Nov 22, 2018)
41979ba  fix (IronPan, Nov 22, 2018)
f449107  fix (IronPan, Nov 22, 2018)
1b86ad0  fix (IronPan, Nov 22, 2018)
7c19bc9  fix (IronPan, Nov 22, 2018)
84bbc39  fix (IronPan, Nov 22, 2018)
1beaae7  fix (IronPan, Nov 22, 2018)
bb6420d  update (IronPan, Nov 22, 2018)
2fde6f4  fix (IronPan, Nov 22, 2018)
4a98037  fix (IronPan, Nov 22, 2018)
ab647bf  fix (IronPan, Nov 22, 2018)
d2e073d  fix (IronPan, Nov 22, 2018)
ed6b9e8  fix (IronPan, Nov 22, 2018)
da34b8d  fix (IronPan, Nov 22, 2018)
75c89c9  fix (IronPan, Nov 22, 2018)
63c9832  fix sample test (IronPan, Nov 22, 2018)
5c73e01  fix (IronPan, Nov 22, 2018)
25a49d9  fix (IronPan, Nov 22, 2018)
a5a14dc  merge (IronPan, Nov 28, 2018)
2701495  merge (IronPan, Nov 28, 2018)
94edfa8  update image builder image (IronPan, Nov 28, 2018)
2fdb5c7  update script (IronPan, Nov 29, 2018)
776034f  mount permission (IronPan, Nov 29, 2018)
14 changes: 1 addition & 13 deletions test/build_image.yaml
@@ -22,7 +22,6 @@ spec:
parameters:
- name: commit-sha
value: master
- name: bootstrapper-image
- name: api-image
- name: frontend-image
- name: scheduledworkflow-image
@@ -32,23 +31,12 @@
inputs:
parameters:
- name: commit-sha
- name: bootstrapper-image
- name: api-image
- name: frontend-image
- name: scheduledworkflow-image
- name: persistenceagent-image
steps:
- - name: build-bootstrapper-image
template: build-image
arguments:
parameters:
- name: commit-sha
value: "{{inputs.parameters.commit-sha}}"
- name: docker-path
value: pipeline
- name: image-name
value: "{{inputs.parameters.bootstrapper-image}}"
- name: build-api-server-image
- - name: build-api-server-image
template: build-image
arguments:
parameters:
51 changes: 51 additions & 0 deletions test/check-argo-status.sh
@@ -0,0 +1,51 @@
#!/bin/bash
#
# Copyright 2018 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

echo "check status of argo workflow $ARGO_WORKFLOW...."
# Probe the argo workflow status until it completes. Timeout after 30 minutes.
for i in $(seq 1 ${PULL_ARGO_WORKFLOW_STATUS_MAX_ATTEMPT})
do
WORKFLOW_STATUS=`kubectl get workflow $ARGO_WORKFLOW --show-labels`
echo $WORKFLOW_STATUS | grep ${WORKFLOW_COMPLETE_KEYWORD} && s=0 && break || s=$? && printf "Workflow ${ARGO_WORKFLOW} is not finished.\n${WORKFLOW_STATUS}\nSleep for 20 seconds...\n" && sleep 20
done

# Check whether the argo workflow finished or not and exit if not.
if [[ $s != 0 ]]; then
echo "Prow job Failed: Argo workflow timeout.."
argo logs -w ${ARGO_WORKFLOW}
exit $s
fi

echo "Argo workflow finished."

if [[ ! -z "$TEST_RESULT_FOLDER" ]]
then
echo "Copy test result"
mkdir -p $ARTIFACT_DIR
gsutil cp -r "${TEST_RESULTS_GCS_DIR}"/* "${ARTIFACT_DIR}" || true
fi

ARGO_WORKFLOW_DETAILS=`argo get ${ARGO_WORKFLOW}`
ARGO_WORKFLOW_LOGS=`argo logs -w ${ARGO_WORKFLOW}`

if [[ $WORKFLOW_STATUS = *"${WORKFLOW_FAILED_KEYWORD}"* ]]; then
printf "The argo workflow failed.\n =========Argo Workflow=========\n${ARGO_WORKFLOW_DETAILS}\n==================\n"
printf "=========Argo Workflow Logs=========\n${ARGO_WORKFLOW_LOGS}\n==================\n"
exit 1
else
printf ${ARGO_WORKFLOW_DETAILS}
exit 0
fi
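
The helper above is meant to be sourced by a test driver that has already defined the workflow name and polling settings in its shell. Below is a minimal sketch of such a caller; the concrete values are illustrative assumptions, not something this PR prescribes:

# Values mirror the variables the driver scripts define before sourcing the helper (all hypothetical).
ARGO_WORKFLOW=e2e-test-gke-abc1234                  # name printed by `argo submit -o name`
PULL_ARGO_WORKFLOW_STATUS_MAX_ATTEMPT=90            # 1800 s timeout / 20 s poll interval
WORKFLOW_COMPLETE_KEYWORD="completed=true"
WORKFLOW_FAILED_KEYWORD="phase=Failed"
TEST_RESULTS_GCS_DIR=gs://ml-pipeline-test/abc1234/api_integration_test
TEST_RESULT_FOLDER=api_integration_test             # optional; triggers the gsutil copy of test results
ARTIFACT_DIR=${WORKSPACE}/_artifacts
source "$(dirname "$0")/check-argo-status.sh"

Because the helper is sourced rather than executed, the caller's plain shell variables are visible to it without being exported.
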
43 changes: 0 additions & 43 deletions test/e2e_test_gke.yaml
@@ -22,11 +22,6 @@ spec:
parameters:
- name: commit-sha
- name: test-results-gcs-dir
- name: bootstrapper-image
- name: api-image
- name: frontend-image
- name: scheduledworkflow-image
- name: persistenceagent-image
- name: api-integration-test-image-suffix
value: api_integration_test
- name: frontend-integration-tests-image-suffix
@@ -43,11 +38,6 @@
parameters:
- name: commit-sha
- name: test-results-gcs-dir
- name: bootstrapper-image
- name: api-image
- name: frontend-image
- name: scheduledworkflow-image
- name: persistenceagent-image
- name: api-integration-test-image-suffix
- name: frontend-integration-tests-image-suffix
- name: basic-e2e-tests-image-suffix
@@ -91,20 +81,6 @@ spec:
value: test/sample-test
- name: image-suffix
value: "{{inputs.parameters.basic-e2e-tests-image-suffix}}"
- - name: deploy-ml-pipeline
template: deploy-ml-pipeline
arguments:
parameters:
- name: bootstrapper-image
value: "{{inputs.parameters.bootstrapper-image}}"
- name: api-image
value: "{{inputs.parameters.api-image}}"
- name: frontend-image
value: "{{inputs.parameters.frontend-image}}"
- name: scheduledworkflow-image
value: "{{inputs.parameters.scheduledworkflow-image}}"
- name: persistenceagent-image
value: "{{inputs.parameters.persistenceagent-image}}"
- - name: run-api-integration-tests
template: run-api-integration-tests
arguments:
@@ -239,25 +215,6 @@ spec:
privileged: true
mirrorVolumeMounts: true

- name: deploy-ml-pipeline
inputs:
parameters:
- name: bootstrapper-image
- name: api-image
- name: frontend-image
- name: scheduledworkflow-image
- name: persistenceagent-image
container:
image: "{{inputs.parameters.bootstrapper-image}}"
args: [
"--api_image", "{{inputs.parameters.api-image}}",
"--ui_image", "{{inputs.parameters.frontend-image}}",
"--scheduled_workflow_image", "{{inputs.parameters.scheduledworkflow-image}}",
"--persistence_agent_image", "{{inputs.parameters.persistenceagent-image}}",
"--deploy_argo", "false", # Argo is already installed in the cluster. No need to install again.
"--report_usage", "false" # Skip reporting usage for test
]

- name: run-api-integration-tests
inputs:
parameters:
139 changes: 139 additions & 0 deletions test/presubmit-tests-with-pipeline-deployment.sh
@@ -0,0 +1,139 @@
#!/bin/bash
#
# Copyright 2018 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

set -xe

usage()
{
echo "usage: deploy.sh
[--workflow_file the file name of the argo workflow to run]
[--test_result_bucket the gcs bucket that argo workflow store the result to. Default is ml-pipeline-test
[--test_result_folder the gcs folder that argo workflow store the result to. Always a relative directory to gs://<gs_bucket>/[PULL_SHA]]
[--timeout timeout of the tests in seconds. Default is 1800 seconds. ]
[-h help]"
}

PROJECT=ml-pipeline-test
TEST_RESULT_BUCKET=ml-pipeline-test
GCR_IMAGE_BASE_DIR=gcr.io/ml-pipeline-test/${PULL_PULL_SHA}
TIMEOUT_SECONDS=1800

while [ "$1" != "" ]; do
case $1 in
--workflow_file ) shift
WORKFLOW_FILE=$1
;;
--test_result_bucket ) shift
TEST_RESULT_BUCKET=$1
;;
--test_result_folder ) shift
TEST_RESULT_FOLDER=$1
;;
--timeout ) shift
TIMEOUT_SECONDS=$1
;;
-h | --help ) usage
exit
;;
* ) usage
exit 1
esac
shift
done

TEST_RESULTS_GCS_DIR=gs://${TEST_RESULT_BUCKET}/${PULL_PULL_SHA}/${TEST_RESULT_FOLDER}
ARTIFACT_DIR=$WORKSPACE/_artifacts
WORKFLOW_COMPLETE_KEYWORD="completed=true"
WORKFLOW_FAILED_KEYWORD="phase=Failed"
PULL_ARGO_WORKFLOW_STATUS_MAX_ATTEMPT=$(expr $TIMEOUT_SECONDS / 20 )

echo "presubmit test starts"

# activating the service account
gcloud auth activate-service-account --key-file="${GOOGLE_APPLICATION_CREDENTIALS}"
gcloud config set compute/zone us-central1-a

# Install ksonnet
KS_VERSION="0.11.0"
curl -LO https://github.com/ksonnet/ksonnet/releases/download/v${KS_VERSION}/ks_${KS_VERSION}_linux_amd64.tar.gz
tar -xzf ks_${KS_VERSION}_linux_amd64.tar.gz
chmod +x ./ks_${KS_VERSION}_linux_amd64/ks
mv ./ks_${KS_VERSION}_linux_amd64/ks /usr/local/bin/

# Install kubeflow
KUBEFLOW_MASTER=$(pwd)/kubeflow_master
git clone https://github.com/kubeflow/kubeflow.git ${KUBEFLOW_MASTER}

## Download latest release source code
KUBEFLOW_SRC=$(pwd)/kubeflow_latest_release
mkdir ${KUBEFLOW_SRC}
cd ${KUBEFLOW_SRC}
export KUBEFLOW_TAG=v0.3.1
curl https://raw.githubusercontent.com/kubeflow/kubeflow/${KUBEFLOW_TAG}/scripts/download.sh | bash

## Override the pipeline config with code from master
cp -r ${KUBEFLOW_MASTER}/kubeflow/pipeline ${KUBEFLOW_SRC}/kubeflow/pipeline
cp -r ${KUBEFLOW_MASTER}/kubeflow/argo ${KUBEFLOW_SRC}/kubeflow/argo

TEST_CLUSTER_PREFIX=${WORKFLOW_FILE%.*}
TEST_CLUSTER=$(echo $TEST_CLUSTER_PREFIX | cut -d _ -f 1)-${PULL_PULL_SHA:0:7}-${RANDOM}
function delete_cluster {
echo "Delete cluster..."
gcloud container clusters delete ${TEST_CLUSTER} --async
}
# trap delete_cluster EXIT

export CLIENT_ID=${RANDOM}
export CLIENT_SECRET=${RANDOM}
KFAPP=$(pwd)/${TEST_CLUSTER}

function clean_up {
echo "Clean up..."
cd ${KFAPP}
Review comment (Contributor): This fails since $KFAPP is a relative path.

${KUBEFLOW_SRC}/scripts/kfctl.sh delete all
}
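# Editor's sketch (not part of this PR's diff): if KFAPP could ever be set to a relative
# path, as the review comment above suggests happened, normalizing it to an absolute path
# before clean_up's `cd ${KFAPP}` runs would avoid that failure. The line below is an
# illustrative assumption, not code from the PR:
KFAPP="$(cd "$(dirname "${KFAPP}")" && pwd)/$(basename "${KFAPP}")"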
# trap delete_cluster EXIT

${KUBEFLOW_SRC}/scripts/kfctl.sh init ${KFAPP} --platform gcp --project ${PROJECT}
cd ${KFAPP}
${KUBEFLOW_SRC}/scripts/kfctl.sh generate platform
${KUBEFLOW_SRC}/scripts/kfctl.sh apply platform
${KUBEFLOW_SRC}/scripts/kfctl.sh generate k8s

## Update pipeline component image
pushd ks_app
ks param set pipeline apiImage ${GCR_IMAGE_BASE_DIR}/api:${PULL_PULL_SHA}
ks param set pipeline persistenceAgentImage ${GCR_IMAGE_BASE_DIR}/persistenceagent:${PULL_PULL_SHA}
ks param set pipeline scheduledWorkflowImage ${GCR_IMAGE_BASE_DIR}/scheduledworkflow:${PULL_PULL_SHA}
ks param set pipeline uiImage ${GCR_IMAGE_BASE_DIR}/frontend:${PULL_PULL_SHA}
popd

${KUBEFLOW_SRC}/scripts/kfctl.sh apply k8s

gcloud container clusters get-credentials ${TEST_CLUSTER}

echo "submitting argo workflow for commit ${PULL_PULL_SHA}..."
ARGO_WORKFLOW=`argo submit $(dirname $0)/${WORKFLOW_FILE} \
-p commit-sha="${PULL_PULL_SHA}" \
-p test-results-gcs-dir="${TEST_RESULTS_GCS_DIR}" \
-p cluster-type="${CLUSTER_TYPE}" \
-o name
`
echo argo workflow submitted successfully

DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" > /dev/null && pwd)"
source "${DIR}/check-argo-status.sh"

45 changes: 4 additions & 41 deletions test/presubmit-tests.sh
@@ -27,6 +27,7 @@ usage()
[-h help]"
}

PROJECT=ml-pipeline-test
TEST_RESULT_BUCKET=ml-pipeline-test
GCR_IMAGE_BASE_DIR=gcr.io/ml-pipeline-test/${PULL_PULL_SHA}
CLUSTER_TYPE=create-gke
@@ -58,7 +59,6 @@ while [ "$1" != "" ]; do
shift
done

ZONE=us-west1-a
TEST_RESULTS_GCS_DIR=gs://${TEST_RESULT_BUCKET}/${PULL_PULL_SHA}/${TEST_RESULT_FOLDER}
ARTIFACT_DIR=$WORKSPACE/_artifacts
WORKFLOW_COMPLETE_KEYWORD="completed=true"
@@ -69,8 +69,8 @@ echo "presubmit test starts"

# activating the service account
gcloud auth activate-service-account --key-file="${GOOGLE_APPLICATION_CREDENTIALS}"
gcloud config set compute/zone us-central1-a
Review comment (Contributor): Why are we hard-coding the zone?
Reply (Member, author): Currently we only requested quota for this zone.


#Creating a new GKE cluster if needed
Review comment (Contributor): Why?
Reply (Member, author): Reverted.

if [ "$CLUSTER_TYPE" == "create-gke" ]; then
echo "create test cluster"
TEST_CLUSTER_PREFIX=${WORKFLOW_FILE%.*}
@@ -97,7 +97,6 @@ if [ "$CLUSTER_TYPE" == "create-gke" ]; then
fi

kubectl config set-context $(kubectl config current-context) --namespace=default

echo "Add necessary cluster role bindings"
ACCOUNT=$(gcloud info --format='value(config.account)')
kubectl create clusterrolebinding PROW_BINDING --clusterrole=cluster-admin --user=$ACCOUNT
@@ -109,17 +108,14 @@ mkdir -p ~/bin/
export PATH=~/bin/:$PATH
curl -sSL -o ~/bin/argo https://github.com/argoproj/argo/releases/download/$ARGO_VERSION/argo-linux-amd64
chmod +x ~/bin/argo

kubectl create ns argo
kubectl apply -n argo -f https://raw.githubusercontent.com/argoproj/argo/$ARGO_VERSION/manifests/install.yaml


echo "submitting argo workflow for commit ${PULL_PULL_SHA}..."
ARGO_WORKFLOW=`argo submit $(dirname $0)/${WORKFLOW_FILE} \
-p commit-sha="${PULL_PULL_SHA}" \
-p test-results-gcs-dir="${TEST_RESULTS_GCS_DIR}" \
-p cluster-type="${CLUSTER_TYPE}" \
-p bootstrapper-image="${GCR_IMAGE_BASE_DIR}/bootstrapper" \
-p api-image="${GCR_IMAGE_BASE_DIR}/api" \
-p frontend-image="${GCR_IMAGE_BASE_DIR}/frontend" \
-p scheduledworkflow-image="${GCR_IMAGE_BASE_DIR}/scheduledworkflow" \
@@ -128,38 +124,5 @@ ARGO_WORKFLOW=`argo submit $(dirname $0)/${WORKFLOW_FILE} \
`
echo argo workflow submitted successfully

echo "check status of argo workflow $ARGO_WORKFLOW...."
# probing the argo workflow status until it completed. Timeout after 30 minutes
for i in $(seq 1 ${PULL_ARGO_WORKFLOW_STATUS_MAX_ATTEMPT})
do
WORKFLOW_STATUS=`kubectl get workflow $ARGO_WORKFLOW --show-labels`
echo $WORKFLOW_STATUS | grep ${WORKFLOW_COMPLETE_KEYWORD} && s=0 && break || s=$? && printf "Workflow ${ARGO_WORKFLOW} is not finished.\n${WORKFLOW_STATUS}\nSleep for 20 seconds...\n" && sleep 20
done

# Check whether the argo workflow finished or not and exit if not.
if [[ $s != 0 ]]; then
echo "Prow job Failed: Argo workflow timeout.."
argo logs -w ${ARGO_WORKFLOW}
exit $s
fi

echo "Argo workflow finished."

if [[ ! -z "$TEST_RESULT_FOLDER" ]]
then
echo "Copy test result"
mkdir -p $ARTIFACT_DIR
gsutil cp -r "${TEST_RESULTS_GCS_DIR}"/* "${ARTIFACT_DIR}" || true
fi

ARGO_WORKFLOW_DETAILS=`argo get ${ARGO_WORKFLOW}`
ARGO_WORKFLOW_LOGS=`argo logs -w ${ARGO_WORKFLOW}`

if [[ $WORKFLOW_STATUS = *"${WORKFLOW_FAILED_KEYWORD}"* ]]; then
printf "The argo workflow failed.\n =========Argo Workflow=========\n${ARGO_WORKFLOW_DETAILS}\n==================\n"
printf "=========Argo Workflow Logs=========\n${ARGO_WORKFLOW_LOGS}\n==================\n"
exit 1
else
printf ${ARGO_WORKFLOW_DETAILS}
exit 0
fi
DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" > /dev/null && pwd)"
source "${DIR}/check-argo-status.sh"