Skip to content

Commit

Permalink
TFjob v1 launcher (#2677)
Browse files Browse the repository at this point in the history
* TFjob v1 launcher

* fix comments
  • Loading branch information
hougangliu authored and k8s-ci-robot committed Dec 10, 2019
1 parent 016f2a3 commit 2923af7
Show file tree
Hide file tree
Showing 17 changed files with 277 additions and 661 deletions.
File renamed without changes.
11 changes: 6 additions & 5 deletions components/kubeflow/katib-launcher/build_image.sh
Original file line number Diff line number Diff line change
Expand Up @@ -34,25 +34,26 @@ done

mkdir -p ./build
rsync -arvp ./src/ ./build/
rsync -arvp ../common/ ./build/

cp ../../license.sh ./build
cp ../../third_party_licenses.csv ./build

LOCAL_LAUNCHER_IMAGE_NAME=ml-pipeline-kubeflow-experiment

docker build -t ${LOCAL_LAUNCHER_IMAGE_NAME} .
if [ -z "${TAG_NAME}" ]; then
TAG_NAME=$(date +v%Y%m%d)-$(git describe --tags --always --dirty)-$(git diff | shasum -a256 | cut -c -6)
fi
if [ -z "${LAUNCHER_IMAGE_NAME}" ]; then
if [ -z "${TAG_NAME}" ]; then
TAG_NAME=$(date +v%Y%m%d)-$(git describe --tags --always --dirty)-$(git diff | shasum -a256 | cut -c -6)
fi
if [ -z "${PROJECT_ID}" ]; then
PROJECT_ID=$(gcloud config config-helper --format "value(configuration.properties.core.project)")
fi
docker tag ${LOCAL_LAUNCHER_IMAGE_NAME} gcr.io/${PROJECT_ID}/${LOCAL_LAUNCHER_IMAGE_NAME}:${TAG_NAME}
docker push gcr.io/${PROJECT_ID}/${LOCAL_LAUNCHER_IMAGE_NAME}:${TAG_NAME}
else
docker tag ${LOCAL_LAUNCHER_IMAGE_NAME} ${LAUNCHER_IMAGE_NAME}
docker push ${LAUNCHER_IMAGE_NAME}
docker tag ${LOCAL_LAUNCHER_IMAGE_NAME} ${LAUNCHER_IMAGE_NAME}:${TAG_NAME}
docker push ${LAUNCHER_IMAGE_NAME}:${TAG_NAME}
fi

rm -rf ./build
48 changes: 6 additions & 42 deletions components/kubeflow/launcher/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright 2018 The Kubeflow Authors
# Copyright 2019 The Kubeflow Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand All @@ -11,52 +11,16 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

FROM ubuntu:16.04

ARG TRAINER_IMAGE_NAME

RUN apt-get update -y

RUN apt-get install --no-install-recommends -y -q ca-certificates python-dev python-setuptools wget unzip git

RUN easy_install pip

RUN pip install pyyaml==3.12 six==1.11.0 requests==2.18.4 tensorflow==1.7.0 \
kubernetes google-api-python-client retrying

RUN wget -nv https://dl.google.com/dl/cloudsdk/release/google-cloud-sdk.zip && \
unzip -qq google-cloud-sdk.zip -d tools && \
rm google-cloud-sdk.zip && \
tools/google-cloud-sdk/install.sh --usage-reporting=false \
--path-update=false --bash-completion=false \
--disable-installation-options && \
tools/google-cloud-sdk/bin/gcloud -q components update \
gcloud core gsutil && \
tools/google-cloud-sdk/bin/gcloud -q components install kubectl && \
tools/google-cloud-sdk/bin/gcloud config set component_manager/disable_update_check true && \
touch /tools/google-cloud-sdk/lib/third_party/google.py

RUN wget -nv https://github.com/ksonnet/ksonnet/releases/download/v0.9.0/ks_0.9.0_linux_amd64.tar.gz && \
tar -xzf ks_0.9.0_linux_amd64.tar.gz && \
mkdir -p /tools/ks/bin && \
cp ./ks_0.9.0_linux_amd64/ks /tools/ks/bin && \
rm ks_0.9.0_linux_amd64.tar.gz && \
rm -r ks_0.9.0_linux_amd64

RUN wget https://github.com/kubeflow/tf-operator/archive/v0.3.0.zip && \
unzip v0.3.0.zip && \
mv tf-operator-0.3.0 tf-operator

ENV PYTHONPATH $PYTHONPATH:/tf-operator

ENV PATH $PATH:/tools/google-cloud-sdk/bin:/tools/ks/bin

ENV TRAINER_IMAGE_NAME $TRAINER_IMAGE_NAME
RUN apt-get update -y && \
apt-get install --no-install-recommends -y -q ca-certificates python-dev python-setuptools wget && \
easy_install pip && \
pip install pyyaml==3.12 kubernetes

ADD build /ml

RUN mkdir /usr/licenses && \
/ml/license.sh /ml/third_party_licenses.csv /usr/licenses

ENTRYPOINT ["python", "/ml/launch_tf_job.py"]
ENTRYPOINT ["python", "/ml/launch_tfjob.py"]
4 changes: 4 additions & 0 deletions components/kubeflow/launcher/OWNERS
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
approvers:
- hougangliu
reviewers:
- hougangliu
57 changes: 14 additions & 43 deletions components/kubeflow/launcher/build_image.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#!/bin/bash -e
# Copyright 2018 Google LLC
# Copyright 2019 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand All @@ -13,13 +13,12 @@
# See the License for the specific language governing permissions and
# limitations under the License.


while getopts ":hp:t:i:" opt; do
case "${opt}" in
h) echo "-p: project name"
echo "-t: tag name"
echo "-i: image name. If provided, project name and tag name are not necessary"
exit
echo "-t: tag name"
echo "-i: image name. If provided, project name and tag name are not necessary"
exit
;;
p) PROJECT_ID=${OPTARG}
;;
Expand All @@ -33,56 +32,28 @@ while getopts ":hp:t:i:" opt; do
esac
done

LOCAL_LAUNCHER_IMAGE_NAME=ml-pipeline-kubeflow-tf
LOCAL_TRAINER_IMAGE_NAME=ml-pipeline-kubeflow-tf-trainer

if [ -z "${PROJECT_ID}" ]; then
PROJECT_ID=$(gcloud config config-helper --format "value(configuration.properties.core.project)")
fi

if [ -z "${TAG_NAME}" ]; then
TAG_NAME=$(date +v%Y%m%d)-$(git describe --tags --always --dirty)-$(git diff | shasum -a256 | cut -c -6)
fi

mkdir -p ./build
rsync -arvp ./src/ ./build/
rsync -arvp ../common/ ./build/

cp ../../license.sh ./build
cp ../../third_party_licenses.csv ./build

# Build the trainer image
if [ -z "${LAUNCHER_IMAGE_NAME}" ]; then
TRAINER_IMAGE_NAME=gcr.io/${PROJECT_ID}/${LOCAL_TRAINER_IMAGE_NAME}:${TAG_NAME}
else
# construct the trainer image name as "launcher_image_name"-trainer:"launcher_image_tag"
colon_index=`expr index "${LAUNCHER_IMAGE_NAME}" :`
if [ $colon_index == '0' ]; then
TRAINER_IMAGE_NAME=${LAUNCHER_IMAGE_NAME}-trainer
else
tag=${LAUNCHER_IMAGE_NAME:$colon_index}
TRAINER_IMAGE_NAME=${LAUNCHER_IMAGE_NAME:0:$colon_index-1}-trainer:${tag}
fi
fi
LOCAL_LAUNCHER_IMAGE_NAME=ml-pipeline-kubeflow-tfjob

bash_dir=`dirname $0`
bash_dir_abs=`realpath $bash_dir`
parent_dir=`dirname ${bash_dir_abs}`
trainer_dir=${parent_dir}/dnntrainer
cd ${trainer_dir}
if [ -z "${LAUNCHER_IMAGE_NAME}" ]; then
./build_image.sh -p ${PROJECT_ID} -t ${TAG_NAME}
else
./build_image.sh -i ${TRAINER_IMAGE_NAME}
docker build -t ${LOCAL_LAUNCHER_IMAGE_NAME} .
if [ -z "${TAG_NAME}" ]; then
TAG_NAME=$(date +v%Y%m%d)-$(git describe --tags --always --dirty)-$(git diff | shasum -a256 | cut -c -6)
fi
cd -

docker build -t ${LOCAL_LAUNCHER_IMAGE_NAME} . --build-arg TRAINER_IMAGE_NAME=${TRAINER_IMAGE_NAME}
if [ -z "${LAUNCHER_IMAGE_NAME}" ]; then
if [ -z "${PROJECT_ID}" ]; then
PROJECT_ID=$(gcloud config config-helper --format "value(configuration.properties.core.project)")
fi
docker tag ${LOCAL_LAUNCHER_IMAGE_NAME} gcr.io/${PROJECT_ID}/${LOCAL_LAUNCHER_IMAGE_NAME}:${TAG_NAME}
docker push gcr.io/${PROJECT_ID}/${LOCAL_LAUNCHER_IMAGE_NAME}:${TAG_NAME}
else
docker tag ${LOCAL_LAUNCHER_IMAGE_NAME} "${LAUNCHER_IMAGE_NAME}"
docker push "${LAUNCHER_IMAGE_NAME}"
docker tag ${LOCAL_LAUNCHER_IMAGE_NAME} ${LAUNCHER_IMAGE_NAME}:${TAG_NAME}
docker push ${LAUNCHER_IMAGE_NAME}:${TAG_NAME}
fi

rm -rf ./build
35 changes: 35 additions & 0 deletions components/kubeflow/launcher/component.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# Component definition for launching a Kubeflow TFJob.
# The container below runs /ml/launch_tfjob.py and receives every input
# declared here as a command-line flag (see `args`).
name: Kubeflow - Launch TFJob
description: Kubeflow TFJob launcher
# NOTE(review): several Integer inputs default to -1; presumably this means
# "not set" — confirm against launch_tfjob.py, which is not visible here.
inputs:
- {name: Name, type: String, description: 'TFJob name.'}
- {name: Namespace, type: String, default: kubeflow, description: 'TFJob namespace.'}
- {name: Version, type: String, default: v1, description: 'TFJob version.'}
- {name: ActiveDeadlineSeconds, type: Integer, default: -1, description: 'Specifies the duration (in seconds) since startTime during which the job can remain active before it is terminated. Must be a positive integer. This setting applies only to pods where restartPolicy is OnFailure or Always.'}
- {name: BackoffLimit, type: Integer, default: -1, description: 'Number of retries before marking this job as failed.'}
- {name: ttl Seconds After Finished, type: Integer, default: -1, description: 'Defines the TTL for cleaning up finished TFJobs.'}
- {name: CleanPodPolicy, type: String, default: Running, description: 'Defines the policy for cleaning up pods after the TFJob completes.'}
# The four replica specs are JSON inputs that default to an empty object.
- {name: PS Spec, type: JSON, default: '{}', description: 'TFJob ps replicaSpecs.'}
- {name: Worker Spec, type: JSON, default: '{}', description: 'TFJob worker replicaSpecs.'}
- {name: Chief Spec, type: JSON, default: '{}', description: 'TFJob chief replicaSpecs.'}
- {name: Evaluator Spec, type: JSON, default: '{}', description: 'TFJob evaluator replicaSpecs.'}
- {name: Tfjob Timeout Minutes, type: Integer, default: 1440, description: 'Time in minutes to wait for the TFJob to complete.'}
- {name: Delete Finished Tfjob, type: Bool, default: 'True' , description: 'Whether to delete the tfjob after it is finished.'}
implementation:
container:
# NOTE(review): the image uses the mutable `latest` tag, so the launcher
# version is not pinned — confirm this is intended.
image: liuhougangxa/kubeflow-tfjob-launcher:latest
command: [python, /ml/launch_tfjob.py]
# One flag per declared input; the flag order here differs slightly from the
# declaration order above (cleanPodPolicy precedes ttlSecondsAfterFinished).
args: [
--name, {inputValue: Name},
--namespace, {inputValue: Namespace},
--version, {inputValue: Version},
--activeDeadlineSeconds, {inputValue: ActiveDeadlineSeconds},
--backoffLimit, {inputValue: BackoffLimit},
--cleanPodPolicy, {inputValue: CleanPodPolicy},
--ttlSecondsAfterFinished, {inputValue: ttl Seconds After Finished},
--psSpec, {inputValue: PS Spec},
--workerSpec, {inputValue: Worker Spec},
--chiefSpec, {inputValue: Chief Spec},
--evaluatorSpec, {inputValue: Evaluator Spec},
--tfjobTimeoutMinutes, {inputValue: Tfjob Timeout Minutes},
--deleteAfterDone, {inputValue: Delete Finished Tfjob},
]
34 changes: 0 additions & 34 deletions components/kubeflow/launcher/kubeflow_tfjob_launcher_op.py

This file was deleted.

75 changes: 75 additions & 0 deletions components/kubeflow/launcher/sample.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
import json
from kfp import components
import kfp.dsl as dsl

# Example pipeline: loads the TFJob launcher component from ./component.yaml
# and invokes it with one chief replica plus an optional set of workers.
# Documentation is kept in comments (not a docstring) so nothing that inspects
# the pipeline function's __doc__ sees a change.
@dsl.pipeline(
    name="Launch kubeflow tfjob",
    description="An example to launch tfjob."
)
def mnist_train(
        name="mnist",
        namespace="kubeflow",
        workerNum=3,
        ttlSecondsAfterFinished=-1,
        tfjobTimeoutMinutes=60,
        deleteAfterDone=False):
    # The component can alternatively be loaded from the repository with
    # components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/master/components/kubeflow/launcher/component.yaml')
    launcher = components.load_component_from_file("./component.yaml")

    def _replica_spec(replica_count):
        # Chief and workers share the same container template; only the
        # replica count differs. A fresh dict is built on every call.
        training_container = {
            "command": [
                "python",
                "/opt/model.py"
            ],
            "args": [
                "--tf-train-steps=6000"
            ],
            "image": "liuhougangxa/tf-estimator-mnist",
            "name": "tensorflow",
        }
        return {
            "replicas": replica_count,
            "restartPolicy": "OnFailure",
            "template": {
                "spec": {
                    "containers": [training_container]
                }
            }
        }

    chief_replicas = _replica_spec(1)
    # An empty spec is passed when no workers are requested.
    worker_replicas = _replica_spec(workerNum) if workerNum > 0 else {}

    launcher(
        name=name,
        namespace=namespace,
        ttl_seconds_after_finished=ttlSecondsAfterFinished,
        worker_spec=worker_replicas,
        chief_spec=chief_replicas,
        tfjob_timeout_minutes=tfjobTimeoutMinutes,
        delete_finished_tfjob=deleteAfterDone
    )

if __name__ == "__main__":
    # Compile the pipeline into a package whose path is this script's path
    # with ".tar.gz" appended.
    import kfp.compiler as compiler

    package_path = __file__ + ".tar.gz"
    compiler.Compiler().compile(mnist_train, package_path)
4 changes: 1 addition & 3 deletions components/kubeflow/launcher/src/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright 2018 Google LLC
# Copyright 2019 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand All @@ -11,5 +11,3 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from .kubeflow_tfjob_launcher_op import kubeflow_tfjob_launcher_op
Loading

0 comments on commit 2923af7

Please sign in to comment.