Skip to content

GPU support #6

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 21 commits into from
Feb 15, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,16 @@ services:
script:
- docker build . -f images/spark-base/Dockerfile -t cortexlabs/spark-base:latest
- docker build . -f images/tf-base/Dockerfile -t cortexlabs/tf-base:latest
- docker build . -f images/tf-base-gpu/Dockerfile -t cortexlabs/tf-base-gpu:latest

- ./build/images.sh images/operator operator
- ./build/images.sh images/spark spark
- ./build/images.sh images/spark-operator spark-operator
- ./build/images.sh images/tf-train tf-train
- ./build/images.sh images/tf-serve tf-serve
- ./build/images.sh images/tf-api tf-api
- ./build/images.sh images/tf-serve-gpu tf-serve-gpu
- ./build/images.sh images/tf-train-gpu tf-train-gpu
- ./build/images.sh images/nginx-controller nginx-controller
- ./build/images.sh images/nginx-backend nginx-backend
- ./build/images.sh images/fluentd fluentd
Expand Down
4 changes: 4 additions & 0 deletions cortex.sh
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,8 @@ export CORTEX_IMAGE_TF_SERVE="${CORTEX_IMAGE_TF_SERVE:-cortexlabs/tf-serve:$CORT
export CORTEX_IMAGE_TF_TRAIN="${CORTEX_IMAGE_TF_TRAIN:-cortexlabs/tf-train:$CORTEX_VERSION_STABLE}"
export CORTEX_IMAGE_TF_API="${CORTEX_IMAGE_TF_API:-cortexlabs/tf-api:$CORTEX_VERSION_STABLE}"
export CORTEX_IMAGE_PYTHON_PACKAGER="${CORTEX_IMAGE_PYTHON_PACKAGER:-cortexlabs/python-packager:$CORTEX_VERSION_STABLE}"
export CORTEX_IMAGE_TF_SERVE_GPU="${CORTEX_IMAGE_TF_SERVE_GPU:-cortexlabs/tf-serve-gpu:$CORTEX_VERSION_STABLE}"
export CORTEX_IMAGE_TF_TRAIN_GPU="${CORTEX_IMAGE_TF_TRAIN_GPU:-cortexlabs/tf-train-gpu:$CORTEX_VERSION_STABLE}"

export AWS_ACCESS_KEY_ID="${AWS_ACCESS_KEY_ID:-""}"
export AWS_SECRET_ACCESS_KEY="${AWS_SECRET_ACCESS_KEY:-""}"
Expand Down Expand Up @@ -291,6 +293,8 @@ function setup_configmap() {
--from-literal='IMAGE_TF_SERVE'=$CORTEX_IMAGE_TF_SERVE \
--from-literal='IMAGE_TF_API'=$CORTEX_IMAGE_TF_API \
--from-literal='IMAGE_PYTHON_PACKAGER'=$CORTEX_IMAGE_PYTHON_PACKAGER \
--from-literal='IMAGE_TF_TRAIN_GPU'=$CORTEX_IMAGE_TF_TRAIN_GPU \
--from-literal='IMAGE_TF_SERVE_GPU'=$CORTEX_IMAGE_TF_SERVE_GPU \
-o yaml --dry-run | kubectl apply -f - >/dev/null
}

Expand Down
7 changes: 6 additions & 1 deletion dev/eks.sh
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,12 @@ function eks_set_cluster() {
}

if [ "$1" = "start" ]; then
eksctl create cluster --version=1.11 --name=$K8S_NAME --region $K8S_REGION --nodes=$K8S_NODE_COUNT --node-type=$K8S_NODE_INSTANCE_TYPE
eksctl create cluster --version=1.11 --name=$K8S_NAME --region $K8S_REGION --nodes-max $K8S_NODES_MAX_COUNT --nodes-min $K8S_NODES_MIN_COUNT --node-type=$K8S_NODE_INSTANCE_TYPE
if [ $K8S_GPU_NODES_MIN_COUNT -gt 0 ] || [ $K8S_GPU_NODES_MAX_COUNT -gt 0 ]; then
eksctl create nodegroup --version=1.11 --cluster=$K8S_NAME --nodes-max=$K8S_GPU_NODES_MAX_COUNT --nodes-min=$K8S_GPU_NODES_MIN_COUNT --node-type=$K8S_GPU_NODE_INSTANCE_TYPE --node-ami=$K8S_GPU_NODE_AMI
echo "Once the GPU nodegroup joins the cluster, run:"
echo "kubectl apply -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v1.11/nvidia-device-plugin.yml"
fi
eks_set_cluster

elif [ "$1" = "update" ]; then
Expand Down
4 changes: 2 additions & 2 deletions dev/kops.sh
Original file line number Diff line number Diff line change
Expand Up @@ -131,8 +131,8 @@ spec:
image: kope.io/k8s-1.11-debian-stretch-amd64-hvm-ebs-2018-08-17
machineType: ${K8S_NODE_INSTANCE_TYPE}
rootVolumeSize: ${K8S_NODE_VOLUME_SIZE}
maxSize: ${K8S_NODE_COUNT}
minSize: ${K8S_NODE_COUNT}
maxSize: ${K8S_NODES_MAX_COUNT}
minSize: ${K8S_NODES_MIN_COUNT}
nodeLabels:
kops.k8s.io/instancegroup: nodes
role: Node
Expand Down
5 changes: 5 additions & 0 deletions dev/registry.sh
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,8 @@ function create_registry() {
aws ecr create-repository --repository-name=cortexlabs/tf-train --region=$REGISTRY_REGION || true
aws ecr create-repository --repository-name=cortexlabs/tf-api --region=$REGISTRY_REGION || true
aws ecr create-repository --repository-name=cortexlabs/python-packager --region=$REGISTRY_REGION || true
aws ecr create-repository --repository-name=cortexlabs/tf-train-gpu --region=$REGISTRY_REGION || true
aws ecr create-repository --repository-name=cortexlabs/tf-serve-gpu --region=$REGISTRY_REGION || true
}

### HELPERS ###
Expand Down Expand Up @@ -115,6 +117,7 @@ elif [ "$cmd" = "update" ]; then
cache_builder $ROOT/images/spark-base spark-base
build_base $ROOT/images/spark-base spark-base
build_base $ROOT/images/tf-base tf-base
build_base $ROOT/images/tf-base-gpu tf-base-gpu

cache_builder $ROOT/images/operator operator
build_and_push $ROOT/images/operator operator latest
Expand All @@ -128,11 +131,13 @@ elif [ "$cmd" = "update" ]; then
build_and_push $ROOT/images/argo-controller argo-controller latest
build_and_push $ROOT/images/argo-executor argo-executor latest
build_and_push $ROOT/images/tf-serve tf-serve latest
build_and_push $ROOT/images/tf-serve-gpu tf-serve-gpu latest
build_and_push $ROOT/images/python-packager python-packager latest
fi

build_and_push $ROOT/images/spark spark latest
build_and_push $ROOT/images/tf-train tf-train latest
build_and_push $ROOT/images/tf-train-gpu tf-train-gpu latest
build_and_push $ROOT/images/tf-api tf-api latest

cleanup
Expand Down
7 changes: 7 additions & 0 deletions docs/applications/advanced/compute.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ For example:
compute:
cpu: "2"
mem: "1Gi"
gpu: 1
```

CPU and memory requests in Cortex correspond to compute resource requests in Kubernetes. In the example above, the training job will only be scheduled once 2 CPUs and 1Gi of memory are available, and the job will be guaranteed to have access to those resources throughout its execution. In some cases, a Cortex compute resource request can be (or may default to) `Null`.
Expand All @@ -21,3 +22,9 @@ One unit of CPU corresponds to one virtual CPU on AWS. Fractional requests are a
## Memory

One unit of memory is one byte. Memory can be expressed as an integer or by using one of these suffixes: `K`, `M`, `G`, `T` (or their power-of two counterparts: `Ki`, `Mi`, `Gi`, `Ti`). For example, the following values represent roughly the same memory: `128974848`, `129e6`, `129M`, `123Mi`.

## GPU
One unit of GPU corresponds to one virtual GPU on AWS. Fractional requests are not allowed. Here's some information on [adding GPU enabled nodes on EKS](https://docs.aws.amazon.com/en_ca/eks/latest/userguide/gpu-ami.html).

## GPU Support
We recommend using GPU compute requests on API resources only if you have enough nodes in your cluster to support the number of GPU requests in model training plus APIs (ideally with an autoscaler). Otherwise, due to the nature of zero downtime rolling updates, your model training will not have sufficient GPU resources as there will always be GPUs consumed by APIs from the previous deployment.
1 change: 1 addition & 0 deletions docs/applications/resources/apis.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ Serve models at scale and use them to build smarter applications.
replicas: <int> # number of replicas to launch (default: 1)
cpu: <string> # CPU request (default: Null)
mem: <string> # memory request (default: Null)
gpu: <string> # GPU request (default: Null)
tags:
<string>: <scalar> # arbitrary key/value pairs to attach to the resource (optional)
...
Expand Down
1 change: 1 addition & 0 deletions docs/applications/resources/models.md
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ Train custom TensorFlow models at scale.
compute:
cpu: <string> # CPU request (default: Null)
mem: <string> # memory request (default: Null)
gpu: <string> # GPU request (default: Null)

tags:
<string>: <scalar> # arbitrary key/value pairs to attach to the resource (optional)
Expand Down
4 changes: 2 additions & 2 deletions docs/applications/resources/statuses.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
| terminated | Resource was terminated |
| upstream error | Resource was not created due to an error in one of its dependencies |
| upstream termination | Resource was not created because one of its dependencies was terminated |
| compute unavailable | Resource's workload could not start due to insufficient memory or CPU in the cluster |
| compute unavailable | Resource's workload could not start due to insufficient memory, CPU or GPU in the cluster |

## API statuses

Expand All @@ -29,4 +29,4 @@
| update skipped | API was not updated due to an error in another resource; a previous version of this API is ready |
| upstream error | API was not created due to an error in one of its dependencies; a previous version of this API may be ready |
| upstream termination | API was not created because one of its dependencies was terminated; a previous version of this API may be ready |
| compute unavailable | API could not start due to insufficient memory or CPU in the cluster; some replicas may be ready |
| compute unavailable | API could not start due to insufficient memory, CPU or GPU in the cluster; some replicas may be ready |
2 changes: 1 addition & 1 deletion images/python-packager/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM ubuntu:18.04
FROM ubuntu:16.04

RUN apt-get update -qq && apt-get install -y -q \
python3 \
Expand Down
4 changes: 2 additions & 2 deletions images/spark-base/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM ubuntu:18.04 as builder
FROM ubuntu:16.04 as builder

RUN apt-get update -qq && apt-get install -y -q \
git \
Expand Down Expand Up @@ -47,7 +47,7 @@ RUN wget -q -P $SPARK_HOME/jars/ http://central.maven.org/maven2/com/amazonaws/a
COPY images/spark-base/conf/* $SPARK_HOME/conf/


FROM ubuntu:18.04
FROM ubuntu:16.04

ENV JAVA_HOME="/usr/lib/jvm/java-8-openjdk-amd64"
ENV HADOOP_HOME="/opt/hadoop"
Expand Down
5 changes: 5 additions & 0 deletions images/tf-base-gpu/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# GPU-enabled TensorFlow base image: upstream CUDA-backed TensorFlow 1.12.0
# with Python 3. The tf-train-gpu and tf-serve-gpu images build on top of this.
FROM tensorflow/tensorflow:1.12.0-gpu-py3

# zlib headers are needed by downstream images that pip-install packages which
# compile against zlib; apt caches are cleaned in the same layer to keep it small.
RUN apt-get update -qq && apt-get install -y -q \
zlib1g-dev \
&& apt-get clean -qq && rm -rf /var/lib/apt/lists/*
17 changes: 1 addition & 16 deletions images/tf-base/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,21 +1,6 @@
FROM ubuntu:18.04

ARG TF_VERSION="1.12.0"
FROM tensorflow/tensorflow:1.12.0-py3

RUN apt-get update -qq && apt-get install -y -q \
build-essential \
curl \
libfreetype6-dev \
libpng-dev \
libzmq3-dev \
pkg-config \
python3 \
python3-dev \
python3-pip \
rsync \
software-properties-common \
unzip \
zlib1g-dev \
&& apt-get clean -qq && rm -rf /var/lib/apt/lists/*

RUN pip3 install https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-${TF_VERSION}-cp36-cp36m-linux_x86_64.whl && rm -rf /root/.cache/pip*
8 changes: 8 additions & 0 deletions images/tf-serve-gpu/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# TensorFlow Serving image for GPU nodes, built on the shared GPU base image.
FROM cortexlabs/tf-base-gpu

# TensorFlow Serving version to install; keep in sync with the TensorFlow
# version baked into cortexlabs/tf-base-gpu (1.12.0).
ARG TF_VERSION="1.12.0"

# Download and install tensorflow-model-server in a single layer:
# - `curl -f` fails on HTTP errors instead of saving an error page that dpkg
#   would then choke on with a confusing message;
# - the .deb is removed in the same RUN so it never persists in an image layer.
RUN curl -f -o tensorflow-model-server.deb http://storage.googleapis.com/tensorflow-serving-apt/pool/tensorflow-model-server-${TF_VERSION}/t/tensorflow-model-server/tensorflow-model-server_${TF_VERSION}_all.deb && \
    dpkg -i tensorflow-model-server.deb && \
    rm tensorflow-model-server.deb

ENTRYPOINT ["tensorflow_model_server"]
18 changes: 1 addition & 17 deletions images/tf-serve/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,25 +1,9 @@
FROM ubuntu:18.04
FROM cortexlabs/tf-base

ARG TF_VERSION="1.12.0"

RUN apt-get update -qq && apt-get install -y -q \
automake \
build-essential \
curl \
libcurl3-dev \
git \
libtool \
libfreetype6-dev \
libpng-dev \
libzmq3-dev \
pkg-config \
python3-dev \
python3-numpy \
python3-pip \
software-properties-common \
swig \
zip \
zlib1g-dev \
&& apt-get clean -qq && rm -rf /var/lib/apt/lists/*

RUN curl -o tensorflow-model-server.deb http://storage.googleapis.com/tensorflow-serving-apt/pool/tensorflow-model-server-${TF_VERSION}/t/tensorflow-model-server/tensorflow-model-server_${TF_VERSION}_all.deb
Expand Down
15 changes: 15 additions & 0 deletions images/tf-train-gpu/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Training image for GPU nodes, built on the shared GPU TensorFlow base image.
FROM cortexlabs/tf-base-gpu

# Make the workload sources importable as top-level packages (lib, tf_train).
ENV PYTHONPATH="/src:${PYTHONPATH}"

# Copy only the requirements files first so dependency installation is cached
# independently of source-code changes.
COPY pkg/workloads/lib/requirements.txt /src/lib/requirements.txt
COPY pkg/workloads/tf_train/requirements.txt /src/tf_train/requirements.txt
RUN pip3 install -r /src/lib/requirements.txt && \
pip3 install -r /src/tf_train/requirements.txt && \
rm -rf /root/.cache/pip*

# Copy the training workload sources after dependencies are installed.
COPY pkg/workloads/consts.py /src/
COPY pkg/workloads/lib /src/lib
COPY pkg/workloads/tf_train /src/tf_train

# The container runs the training entry point directly.
ENTRYPOINT ["/usr/bin/python3", "/src/tf_train/train.py"]
28 changes: 28 additions & 0 deletions pkg/api/userconfig/compute.go
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,7 @@ func (sparkCompute *SparkCompute) ID() string {
// TFCompute describes the compute resource requests (CPU, memory, GPU) for a
// TensorFlow training workload. All fields are optional pointers: a nil field
// means no request was specified for that resource.
type TFCompute struct {
CPU *Quantity `json:"cpu" yaml:"cpu"`
Mem *Quantity `json:"mem" yaml:"mem"`
GPU *int64 `json:"gpu" yaml:"gpu"`
}

var tfComputeFieldValidation = &cr.StructFieldValidation{
Expand All @@ -166,6 +167,13 @@ var tfComputeFieldValidation = &cr.StructFieldValidation{
Min: k8sresource.MustParse("0"),
}),
},
&cr.StructFieldValidation{
StructField: "GPU",
Int64PtrValidation: &cr.Int64PtrValidation{
Default: nil,
GreaterThan: util.Int64Ptr(0),
},
},
},
},
}
Expand All @@ -181,6 +189,7 @@ type APICompute struct {
Replicas int32 `json:"replicas" yaml:"replicas"`
CPU *Quantity `json:"cpu" yaml:"cpu"`
Mem *Quantity `json:"mem" yaml:"mem"`
GPU int64 `json:"gpu" yaml:"gpu"`
}

var apiComputeFieldValidation = &cr.StructFieldValidation{
Expand Down Expand Up @@ -212,6 +221,13 @@ var apiComputeFieldValidation = &cr.StructFieldValidation{
Min: k8sresource.MustParse("0"),
}),
},
&cr.StructFieldValidation{
StructField: "GPU",
Int64Validation: &cr.Int64Validation{
Default: 0,
GreaterThanOrEqualTo: util.Int64Ptr(0),
},
},
},
},
}
Expand All @@ -221,13 +237,15 @@ func (apiCompute *APICompute) ID() string {
buf.WriteString(s.Int32(apiCompute.Replicas))
buf.WriteString(QuantityPtrID(apiCompute.CPU))
buf.WriteString(QuantityPtrID(apiCompute.Mem))
buf.WriteString(s.Int64(apiCompute.GPU))
return util.HashBytes(buf.Bytes())
}

// IDWithoutReplicas hashes the compute resource requests (CPU, memory, GPU)
// while ignoring the replica count, so two APIComputes that differ only in
// Replicas produce the same ID. The fields are written in a fixed order so
// the hash is deterministic.
func (apiCompute *APICompute) IDWithoutReplicas() string {
	var buf bytes.Buffer
	for _, part := range []string{
		QuantityPtrID(apiCompute.CPU),
		QuantityPtrID(apiCompute.Mem),
		s.Int64(apiCompute.GPU),
	} {
		buf.WriteString(part)
	}
	return util.HashBytes(buf.Bytes())
}

Expand Down Expand Up @@ -284,6 +302,11 @@ func MaxTFCompute(tfComputes ...*TFCompute) *TFCompute {
aggregated.Mem = tfCompute.Mem
}
}
if tfCompute.GPU != nil {
if aggregated.GPU == nil || *tfCompute.GPU > *aggregated.GPU {
aggregated.GPU = tfCompute.GPU
}
}
}

return &aggregated
Expand All @@ -299,5 +322,10 @@ func (apiCompute *APICompute) Equal(apiCompute2 APICompute) bool {
if !QuantityPtrsEqual(apiCompute.Mem, apiCompute2.Mem) {
return false
}

if apiCompute.GPU != apiCompute2.GPU {
return false
}

return true
}
4 changes: 4 additions & 0 deletions pkg/operator/cortexconfig/cortex_config.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ var (
TFServeImage string
TFAPIImage string
PythonPackagerImage string
TFTrainImageGPU string
TFServeImageGPU string
)

func init() {
Expand All @@ -47,6 +49,8 @@ func init() {
TFServeImage = getStr("IMAGE_TF_SERVE")
TFAPIImage = getStr("IMAGE_TF_API")
PythonPackagerImage = getStr("IMAGE_PYTHON_PACKAGER")
TFTrainImageGPU = getStr("IMAGE_TF_TRAIN_GPU")
TFServeImageGPU = getStr("IMAGE_TF_SERVE_GPU")
}

//
Expand Down
Loading