Skip to content

Commit cff9fac

Browse files
authored
GPU support (#6)
* add GPU support * separate GPU docker images * add gpu to compute docs * update dockerfiles - remove license and use tf image directly * use ubuntu 16
1 parent 3c3f65f commit cff9fac

File tree

22 files changed

+134
-50
lines changed

22 files changed

+134
-50
lines changed

.travis.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,13 +13,16 @@ services:
1313
script:
1414
- docker build . -f images/spark-base/Dockerfile -t cortexlabs/spark-base:latest
1515
- docker build . -f images/tf-base/Dockerfile -t cortexlabs/tf-base:latest
16+
- docker build . -f images/tf-base-gpu/Dockerfile -t cortexlabs/tf-base-gpu:latest
1617

1718
- ./build/images.sh images/operator operator
1819
- ./build/images.sh images/spark spark
1920
- ./build/images.sh images/spark-operator spark-operator
2021
- ./build/images.sh images/tf-train tf-train
2122
- ./build/images.sh images/tf-serve tf-serve
2223
- ./build/images.sh images/tf-api tf-api
24+
- ./build/images.sh images/tf-serve-gpu tf-serve-gpu
25+
- ./build/images.sh images/tf-train-gpu tf-train-gpu
2326
- ./build/images.sh images/nginx-controller nginx-controller
2427
- ./build/images.sh images/nginx-backend nginx-backend
2528
- ./build/images.sh images/fluentd fluentd

cortex.sh

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,8 @@ export CORTEX_IMAGE_TF_SERVE="${CORTEX_IMAGE_TF_SERVE:-cortexlabs/tf-serve:$CORT
147147
export CORTEX_IMAGE_TF_TRAIN="${CORTEX_IMAGE_TF_TRAIN:-cortexlabs/tf-train:$CORTEX_VERSION_STABLE}"
148148
export CORTEX_IMAGE_TF_API="${CORTEX_IMAGE_TF_API:-cortexlabs/tf-api:$CORTEX_VERSION_STABLE}"
149149
export CORTEX_IMAGE_PYTHON_PACKAGER="${CORTEX_IMAGE_PYTHON_PACKAGER:-cortexlabs/python-packager:$CORTEX_VERSION_STABLE}"
150+
export CORTEX_IMAGE_TF_SERVE_GPU="${CORTEX_IMAGE_TF_SERVE_GPU:-cortexlabs/tf-serve-gpu:$CORTEX_VERSION_STABLE}"
151+
export CORTEX_IMAGE_TF_TRAIN_GPU="${CORTEX_IMAGE_TF_TRAIN_GPU:-cortexlabs/tf-train-gpu:$CORTEX_VERSION_STABLE}"
150152

151153
export AWS_ACCESS_KEY_ID="${AWS_ACCESS_KEY_ID:-""}"
152154
export AWS_SECRET_ACCESS_KEY="${AWS_SECRET_ACCESS_KEY:-""}"
@@ -291,6 +293,8 @@ function setup_configmap() {
291293
--from-literal='IMAGE_TF_SERVE'=$CORTEX_IMAGE_TF_SERVE \
292294
--from-literal='IMAGE_TF_API'=$CORTEX_IMAGE_TF_API \
293295
--from-literal='IMAGE_PYTHON_PACKAGER'=$CORTEX_IMAGE_PYTHON_PACKAGER \
296+
--from-literal='IMAGE_TF_TRAIN_GPU'=$CORTEX_IMAGE_TF_TRAIN_GPU \
297+
--from-literal='IMAGE_TF_SERVE_GPU'=$CORTEX_IMAGE_TF_SERVE_GPU \
294298
-o yaml --dry-run | kubectl apply -f - >/dev/null
295299
}
296300

dev/eks.sh

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,12 @@ function eks_set_cluster() {
2626
}
2727

2828
if [ "$1" = "start" ]; then
29-
eksctl create cluster --version=1.11 --name=$K8S_NAME --region $K8S_REGION --nodes=$K8S_NODE_COUNT --node-type=$K8S_NODE_INSTANCE_TYPE
29+
eksctl create cluster --version=1.11 --name=$K8S_NAME --region $K8S_REGION --nodes-max $K8S_NODES_MAX_COUNT --nodes-min $K8S_NODES_MIN_COUNT --node-type=$K8S_NODE_INSTANCE_TYPE
30+
if [ $K8S_GPU_NODES_MIN_COUNT -gt 0 ] || [ $K8S_GPU_NODES_MAX_COUNT -gt 0 ]; then
31+
eksctl create nodegroup --version=1.11 --cluster=$K8S_NAME --nodes-max=$K8S_GPU_NODES_MAX_COUNT --nodes-min=$K8S_GPU_NODES_MIN_COUNT --node-type=$K8S_GPU_NODE_INSTANCE_TYPE --node-ami=$K8S_GPU_NODE_AMI
32+
echo "Once the GPU nodegroup joins the cluster, run:"
33+
echo "kubectl apply -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v1.11/nvidia-device-plugin.yml"
34+
fi
3035
eks_set_cluster
3136

3237
elif [ "$1" = "update" ]; then

dev/kops.sh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -131,8 +131,8 @@ spec:
131131
image: kope.io/k8s-1.11-debian-stretch-amd64-hvm-ebs-2018-08-17
132132
machineType: ${K8S_NODE_INSTANCE_TYPE}
133133
rootVolumeSize: ${K8S_NODE_VOLUME_SIZE}
134-
maxSize: ${K8S_NODE_COUNT}
135-
minSize: ${K8S_NODE_COUNT}
134+
maxSize: ${K8S_NODES_MAX_COUNT}
135+
minSize: ${K8S_NODES_MIN_COUNT}
136136
nodeLabels:
137137
kops.k8s.io/instancegroup: nodes
138138
role: Node

dev/registry.sh

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,8 @@ function create_registry() {
4747
aws ecr create-repository --repository-name=cortexlabs/tf-train --region=$REGISTRY_REGION || true
4848
aws ecr create-repository --repository-name=cortexlabs/tf-api --region=$REGISTRY_REGION || true
4949
aws ecr create-repository --repository-name=cortexlabs/python-packager --region=$REGISTRY_REGION || true
50+
aws ecr create-repository --repository-name=cortexlabs/tf-train-gpu --region=$REGISTRY_REGION || true
51+
aws ecr create-repository --repository-name=cortexlabs/tf-serve-gpu --region=$REGISTRY_REGION || true
5052
}
5153

5254
### HELPERS ###
@@ -115,6 +117,7 @@ elif [ "$cmd" = "update" ]; then
115117
cache_builder $ROOT/images/spark-base spark-base
116118
build_base $ROOT/images/spark-base spark-base
117119
build_base $ROOT/images/tf-base tf-base
120+
build_base $ROOT/images/tf-base-gpu tf-base-gpu
118121

119122
cache_builder $ROOT/images/operator operator
120123
build_and_push $ROOT/images/operator operator latest
@@ -128,11 +131,13 @@ elif [ "$cmd" = "update" ]; then
128131
build_and_push $ROOT/images/argo-controller argo-controller latest
129132
build_and_push $ROOT/images/argo-executor argo-executor latest
130133
build_and_push $ROOT/images/tf-serve tf-serve latest
134+
build_and_push $ROOT/images/tf-serve-gpu tf-serve-gpu latest
131135
build_and_push $ROOT/images/python-packager python-packager latest
132136
fi
133137

134138
build_and_push $ROOT/images/spark spark latest
135139
build_and_push $ROOT/images/tf-train tf-train latest
140+
build_and_push $ROOT/images/tf-train-gpu tf-train-gpu latest
136141
build_and_push $ROOT/images/tf-api tf-api latest
137142

138143
cleanup

docs/applications/advanced/compute.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ For example:
1010
compute:
1111
cpu: "2"
1212
mem: "1Gi"
13+
gpu: 1
1314
```
1415
1516
CPU and memory requests in Cortex correspond to compute resource requests in Kubernetes. In the example above, the training job will only be scheduled once 2 CPUs and 1Gi of memory are available, and the job will be guaranteed to have access to those resources throughout its execution. In some cases, a Cortex compute resource request can be (or may default to) `Null`.
@@ -21,3 +22,9 @@ One unit of CPU corresponds to one virtual CPU on AWS. Fractional requests are a
2122
## Memory
2223

2324
One unit of memory is one byte. Memory can be expressed as an integer or by using one of these suffixes: `K`, `M`, `G`, `T` (or their power-of two counterparts: `Ki`, `Mi`, `Gi`, `Ti`). For example, the following values represent roughly the same memory: `128974848`, `129e6`, `129M`, `123Mi`.
25+
26+
## GPU
27+
One unit of GPU corresponds to one virtual GPU on AWS. Fractional requests are not allowed. Here's some information on [adding GPU enabled nodes on EKS](https://docs.aws.amazon.com/en_ca/eks/latest/userguide/gpu-ami.html).
28+
29+
## GPU Support
30+
We recommend using GPU compute requests on API resources only if you have enough nodes in your cluster to support the number of GPU requests in model training plus APIs (ideally with an autoscaler). Otherwise, due to the nature of zero downtime rolling updates, your model training will not have sufficient GPU resources as there will always be GPUs consumed by APIs from the previous deployment.

docs/applications/resources/apis.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ Serve models at scale and use them to build smarter applications.
1212
replicas: <int> # number of replicas to launch (default: 1)
1313
cpu: <string> # CPU request (default: Null)
1414
mem: <string> # memory request (default: Null)
15+
gpu: <string> # gpu request (default: Null)
1516
tags:
1617
<string>: <scalar> # arbitrary key/value pairs to attach to the resource (optional)
1718
...

docs/applications/resources/models.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ Train custom TensorFlow models at scale.
4444
compute:
4545
cpu: <string> # CPU request (default: Null)
4646
mem: <string> # memory request (default: Null)
47+
gpu: <string> # GPU request (default: Null)
4748

4849
tags:
4950
<string>: <scalar> # arbitrary key/value pairs to attach to the resource (optional)

docs/applications/resources/statuses.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
| terminated | Resource was terminated |
1313
| upstream error | Resource was not created due to an error in one of its dependencies |
1414
| upstream termination | Resource was not created because one of its dependencies was terminated |
15-
| compute unavailable | Resource's workload could not start due to insufficient memory or CPU in the cluster |
15+
| compute unavailable | Resource's workload could not start due to insufficient memory, CPU or GPU in the cluster |
1616

1717
## API statuses
1818

@@ -29,4 +29,4 @@
2929
| update skipped | API was not updated due to an error in another resource; a previous version of this API is ready |
3030
| upstream error | API was not created due to an error in one of its dependencies; a previous version of this API may be ready |
3131
| upstream termination | API was not created because one of its dependencies was terminated; a previous version of this API may be ready |
32-
| compute unavailable | API could not start due to insufficient memory or CPU in the cluster; some replicas may be ready |
32+
| compute unavailable | API could not start due to insufficient memory, CPU or GPU in the cluster; some replicas may be ready |

images/python-packager/Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
FROM ubuntu:18.04
1+
FROM ubuntu:16.04
22

33
RUN apt-get update -qq && apt-get install -y -q \
44
python3 \

images/spark-base/Dockerfile

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
FROM ubuntu:18.04 as builder
1+
FROM ubuntu:16.04 as builder
22

33
RUN apt-get update -qq && apt-get install -y -q \
44
git \
@@ -47,7 +47,7 @@ RUN wget -q -P $SPARK_HOME/jars/ http://central.maven.org/maven2/com/amazonaws/a
4747
COPY images/spark-base/conf/* $SPARK_HOME/conf/
4848

4949

50-
FROM ubuntu:18.04
50+
FROM ubuntu:16.04
5151

5252
ENV JAVA_HOME="/usr/lib/jvm/java-8-openjdk-amd64"
5353
ENV HADOOP_HOME="/opt/hadoop"

images/tf-base-gpu/Dockerfile

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
FROM tensorflow/tensorflow:1.12.0-gpu-py3
2+
3+
RUN apt-get update -qq && apt-get install -y -q \
4+
zlib1g-dev \
5+
&& apt-get clean -qq && rm -rf /var/lib/apt/lists/*

images/tf-base/Dockerfile

Lines changed: 1 addition & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,6 @@
1-
FROM ubuntu:18.04
2-
3-
ARG TF_VERSION="1.12.0"
1+
FROM tensorflow/tensorflow:1.12.0-py3
42

53
RUN apt-get update -qq && apt-get install -y -q \
6-
build-essential \
7-
curl \
8-
libfreetype6-dev \
9-
libpng-dev \
10-
libzmq3-dev \
11-
pkg-config \
12-
python3 \
13-
python3-dev \
14-
python3-pip \
15-
rsync \
16-
software-properties-common \
17-
unzip \
184
zlib1g-dev \
195
&& apt-get clean -qq && rm -rf /var/lib/apt/lists/*
206

21-
RUN pip3 install https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-${TF_VERSION}-cp36-cp36m-linux_x86_64.whl && rm -rf /root/.cache/pip*

images/tf-serve-gpu/Dockerfile

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
FROM cortexlabs/tf-base-gpu
2+
3+
ARG TF_VERSION="1.12.0"
4+
5+
RUN curl -o tensorflow-model-server.deb http://storage.googleapis.com/tensorflow-serving-apt/pool/tensorflow-model-server-${TF_VERSION}/t/tensorflow-model-server/tensorflow-model-server_${TF_VERSION}_all.deb
6+
RUN dpkg -i tensorflow-model-server.deb
7+
8+
ENTRYPOINT ["tensorflow_model_server"]

images/tf-serve/Dockerfile

Lines changed: 1 addition & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,25 +1,9 @@
1-
FROM ubuntu:18.04
1+
FROM cortexlabs/tf-base
22

33
ARG TF_VERSION="1.12.0"
44

55
RUN apt-get update -qq && apt-get install -y -q \
6-
automake \
7-
build-essential \
86
curl \
9-
libcurl3-dev \
10-
git \
11-
libtool \
12-
libfreetype6-dev \
13-
libpng-dev \
14-
libzmq3-dev \
15-
pkg-config \
16-
python3-dev \
17-
python3-numpy \
18-
python3-pip \
19-
software-properties-common \
20-
swig \
21-
zip \
22-
zlib1g-dev \
237
&& apt-get clean -qq && rm -rf /var/lib/apt/lists/*
248

259
RUN curl -o tensorflow-model-server.deb http://storage.googleapis.com/tensorflow-serving-apt/pool/tensorflow-model-server-${TF_VERSION}/t/tensorflow-model-server/tensorflow-model-server_${TF_VERSION}_all.deb

images/tf-train-gpu/Dockerfile

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
FROM cortexlabs/tf-base-gpu
2+
3+
ENV PYTHONPATH="/src:${PYTHONPATH}"
4+
5+
COPY pkg/workloads/lib/requirements.txt /src/lib/requirements.txt
6+
COPY pkg/workloads/tf_train/requirements.txt /src/tf_train/requirements.txt
7+
RUN pip3 install -r /src/lib/requirements.txt && \
8+
pip3 install -r /src/tf_train/requirements.txt && \
9+
rm -rf /root/.cache/pip*
10+
11+
COPY pkg/workloads/consts.py /src/
12+
COPY pkg/workloads/lib /src/lib
13+
COPY pkg/workloads/tf_train /src/tf_train
14+
15+
ENTRYPOINT ["/usr/bin/python3", "/src/tf_train/train.py"]

pkg/api/userconfig/compute.go

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,7 @@ func (sparkCompute *SparkCompute) ID() string {
142142
type TFCompute struct {
143143
CPU *Quantity `json:"cpu" yaml:"cpu"`
144144
Mem *Quantity `json:"mem" yaml:"mem"`
145+
GPU *int64 `json:"gpu" yaml:"gpu"`
145146
}
146147

147148
var tfComputeFieldValidation = &cr.StructFieldValidation{
@@ -166,6 +167,13 @@ var tfComputeFieldValidation = &cr.StructFieldValidation{
166167
Min: k8sresource.MustParse("0"),
167168
}),
168169
},
170+
&cr.StructFieldValidation{
171+
StructField: "GPU",
172+
Int64PtrValidation: &cr.Int64PtrValidation{
173+
Default: nil,
174+
GreaterThan: util.Int64Ptr(0),
175+
},
176+
},
169177
},
170178
},
171179
}
@@ -181,6 +189,7 @@ type APICompute struct {
181189
Replicas int32 `json:"replicas" yaml:"replicas"`
182190
CPU *Quantity `json:"cpu" yaml:"cpu"`
183191
Mem *Quantity `json:"mem" yaml:"mem"`
192+
GPU int64 `json:"gpu" yaml:"gpu"`
184193
}
185194

186195
var apiComputeFieldValidation = &cr.StructFieldValidation{
@@ -212,6 +221,13 @@ var apiComputeFieldValidation = &cr.StructFieldValidation{
212221
Min: k8sresource.MustParse("0"),
213222
}),
214223
},
224+
&cr.StructFieldValidation{
225+
StructField: "GPU",
226+
Int64Validation: &cr.Int64Validation{
227+
Default: 0,
228+
GreaterThanOrEqualTo: util.Int64Ptr(0),
229+
},
230+
},
215231
},
216232
},
217233
}
@@ -221,13 +237,15 @@ func (apiCompute *APICompute) ID() string {
221237
buf.WriteString(s.Int32(apiCompute.Replicas))
222238
buf.WriteString(QuantityPtrID(apiCompute.CPU))
223239
buf.WriteString(QuantityPtrID(apiCompute.Mem))
240+
buf.WriteString(s.Int64(apiCompute.GPU))
224241
return util.HashBytes(buf.Bytes())
225242
}
226243

227244
func (apiCompute *APICompute) IDWithoutReplicas() string {
228245
var buf bytes.Buffer
229246
buf.WriteString(QuantityPtrID(apiCompute.CPU))
230247
buf.WriteString(QuantityPtrID(apiCompute.Mem))
248+
buf.WriteString(s.Int64(apiCompute.GPU))
231249
return util.HashBytes(buf.Bytes())
232250
}
233251

@@ -284,6 +302,11 @@ func MaxTFCompute(tfComputes ...*TFCompute) *TFCompute {
284302
aggregated.Mem = tfCompute.Mem
285303
}
286304
}
305+
if tfCompute.GPU != nil {
306+
if aggregated.GPU == nil || *tfCompute.GPU > *aggregated.GPU {
307+
aggregated.GPU = tfCompute.GPU
308+
}
309+
}
287310
}
288311

289312
return &aggregated
@@ -299,5 +322,10 @@ func (apiCompute *APICompute) Equal(apiCompute2 APICompute) bool {
299322
if !QuantityPtrsEqual(apiCompute.Mem, apiCompute2.Mem) {
300323
return false
301324
}
325+
326+
if apiCompute.GPU != apiCompute2.GPU {
327+
return false
328+
}
329+
302330
return true
303331
}

pkg/operator/cortexconfig/cortex_config.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,8 @@ var (
3434
TFServeImage string
3535
TFAPIImage string
3636
PythonPackagerImage string
37+
TFTrainImageGPU string
38+
TFServeImageGPU string
3739
)
3840

3941
func init() {
@@ -47,6 +49,8 @@ func init() {
4749
TFServeImage = getStr("IMAGE_TF_SERVE")
4850
TFAPIImage = getStr("IMAGE_TF_API")
4951
PythonPackagerImage = getStr("IMAGE_PYTHON_PACKAGER")
52+
TFTrainImageGPU = getStr("IMAGE_TF_TRAIN_GPU")
53+
TFServeImageGPU = getStr("IMAGE_TF_SERVE_GPU")
5054
}
5155

5256
//

0 commit comments

Comments
 (0)