Skip to content

Commit cff9fac

Browse files
authored
GPU support (#6)
* add GPU support * separate GPU docker images * add gpu to compute docs * update dockerfiles - remove license and use tf image directly * use ubuntu 16
1 parent 3c3f65f commit cff9fac

File tree

22 files changed

+134
-50
lines changed

22 files changed

+134
-50
lines changed

.travis.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,13 +13,16 @@ services:
1313
script:
1414
- docker build . -f images/spark-base/Dockerfile -t cortexlabs/spark-base:latest
1515
- docker build . -f images/tf-base/Dockerfile -t cortexlabs/tf-base:latest
16+
- docker build . -f images/tf-base-gpu/Dockerfile -t cortexlabs/tf-base-gpu:latest
1617

1718
- ./build/images.sh images/operator operator
1819
- ./build/images.sh images/spark spark
1920
- ./build/images.sh images/spark-operator spark-operator
2021
- ./build/images.sh images/tf-train tf-train
2122
- ./build/images.sh images/tf-serve tf-serve
2223
- ./build/images.sh images/tf-api tf-api
24+
- ./build/images.sh images/tf-serve-gpu tf-serve-gpu
25+
- ./build/images.sh images/tf-train-gpu tf-train-gpu
2326
- ./build/images.sh images/nginx-controller nginx-controller
2427
- ./build/images.sh images/nginx-backend nginx-backend
2528
- ./build/images.sh images/fluentd fluentd

cortex.sh

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,8 @@ export CORTEX_IMAGE_TF_SERVE="${CORTEX_IMAGE_TF_SERVE:-cortexlabs/tf-serve:$CORT
147147
export CORTEX_IMAGE_TF_TRAIN="${CORTEX_IMAGE_TF_TRAIN:-cortexlabs/tf-train:$CORTEX_VERSION_STABLE}"
148148
export CORTEX_IMAGE_TF_API="${CORTEX_IMAGE_TF_API:-cortexlabs/tf-api:$CORTEX_VERSION_STABLE}"
149149
export CORTEX_IMAGE_PYTHON_PACKAGER="${CORTEX_IMAGE_PYTHON_PACKAGER:-cortexlabs/python-packager:$CORTEX_VERSION_STABLE}"
150+
export CORTEX_IMAGE_TF_SERVE_GPU="${CORTEX_IMAGE_TF_SERVE_GPU:-cortexlabs/tf-serve-gpu:$CORTEX_VERSION_STABLE}"
151+
export CORTEX_IMAGE_TF_TRAIN_GPU="${CORTEX_IMAGE_TF_TRAIN_GPU:-cortexlabs/tf-train-gpu:$CORTEX_VERSION_STABLE}"
150152

151153
export AWS_ACCESS_KEY_ID="${AWS_ACCESS_KEY_ID:-""}"
152154
export AWS_SECRET_ACCESS_KEY="${AWS_SECRET_ACCESS_KEY:-""}"
@@ -291,6 +293,8 @@ function setup_configmap() {
291293
--from-literal='IMAGE_TF_SERVE'=$CORTEX_IMAGE_TF_SERVE \
292294
--from-literal='IMAGE_TF_API'=$CORTEX_IMAGE_TF_API \
293295
--from-literal='IMAGE_PYTHON_PACKAGER'=$CORTEX_IMAGE_PYTHON_PACKAGER \
296+
--from-literal='IMAGE_TF_TRAIN_GPU'=$CORTEX_IMAGE_TF_TRAIN_GPU \
297+
--from-literal='IMAGE_TF_SERVE_GPU'=$CORTEX_IMAGE_TF_SERVE_GPU \
294298
-o yaml --dry-run | kubectl apply -f - >/dev/null
295299
}
296300

dev/eks.sh

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,12 @@ function eks_set_cluster() {
2626
}
2727

2828
if [ "$1" = "start" ]; then
29-
eksctl create cluster --version=1.11 --name=$K8S_NAME --region $K8S_REGION --nodes=$K8S_NODE_COUNT --node-type=$K8S_NODE_INSTANCE_TYPE
29+
eksctl create cluster --version=1.11 --name=$K8S_NAME --region $K8S_REGION --nodes-max $K8S_NODES_MAX_COUNT --nodes-min $K8S_NODES_MIN_COUNT --node-type=$K8S_NODE_INSTANCE_TYPE
30+
if [ $K8S_GPU_NODES_MIN_COUNT -gt 0 ] || [ $K8S_GPU_NODES_MAX_COUNT -gt 0 ]; then
31+
eksctl create nodegroup --version=1.11 --cluster=$K8S_NAME --nodes-max=$K8S_GPU_NODES_MAX_COUNT --nodes-min=$K8S_GPU_NODES_MIN_COUNT --node-type=$K8S_GPU_NODE_INSTANCE_TYPE --node-ami=$K8S_GPU_NODE_AMI
32+
echo "Once the GPU nodegroup joins the cluster, run:"
33+
echo "kubectl apply -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v1.11/nvidia-device-plugin.yml"
34+
fi
3035
eks_set_cluster
3136

3237
elif [ "$1" = "update" ]; then

dev/kops.sh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -131,8 +131,8 @@ spec:
131131
image: kope.io/k8s-1.11-debian-stretch-amd64-hvm-ebs-2018-08-17
132132
machineType: ${K8S_NODE_INSTANCE_TYPE}
133133
rootVolumeSize: ${K8S_NODE_VOLUME_SIZE}
134-
maxSize: ${K8S_NODE_COUNT}
135-
minSize: ${K8S_NODE_COUNT}
134+
maxSize: ${K8S_NODES_MAX_COUNT}
135+
minSize: ${K8S_NODES_MIN_COUNT}
136136
nodeLabels:
137137
kops.k8s.io/instancegroup: nodes
138138
role: Node

dev/registry.sh

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,8 @@ function create_registry() {
4747
aws ecr create-repository --repository-name=cortexlabs/tf-train --region=$REGISTRY_REGION || true
4848
aws ecr create-repository --repository-name=cortexlabs/tf-api --region=$REGISTRY_REGION || true
4949
aws ecr create-repository --repository-name=cortexlabs/python-packager --region=$REGISTRY_REGION || true
50+
aws ecr create-repository --repository-name=cortexlabs/tf-train-gpu --region=$REGISTRY_REGION || true
51+
aws ecr create-repository --repository-name=cortexlabs/tf-serve-gpu --region=$REGISTRY_REGION || true
5052
}
5153

5254
### HELPERS ###
@@ -115,6 +117,7 @@ elif [ "$cmd" = "update" ]; then
115117
cache_builder $ROOT/images/spark-base spark-base
116118
build_base $ROOT/images/spark-base spark-base
117119
build_base $ROOT/images/tf-base tf-base
120+
build_base $ROOT/images/tf-base-gpu tf-base-gpu
118121

119122
cache_builder $ROOT/images/operator operator
120123
build_and_push $ROOT/images/operator operator latest
@@ -128,11 +131,13 @@ elif [ "$cmd" = "update" ]; then
128131
build_and_push $ROOT/images/argo-controller argo-controller latest
129132
build_and_push $ROOT/images/argo-executor argo-executor latest
130133
build_and_push $ROOT/images/tf-serve tf-serve latest
134+
build_and_push $ROOT/images/tf-serve-gpu tf-serve-gpu latest
131135
build_and_push $ROOT/images/python-packager python-packager latest
132136
fi
133137

134138
build_and_push $ROOT/images/spark spark latest
135139
build_and_push $ROOT/images/tf-train tf-train latest
140+
build_and_push $ROOT/images/tf-train-gpu tf-train-gpu latest
136141
build_and_push $ROOT/images/tf-api tf-api latest
137142

138143
cleanup

docs/applications/advanced/compute.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ For example:
1010
compute:
1111
cpu: "2"
1212
mem: "1Gi"
13+
gpu: 1
1314
```
1415
1516
CPU and memory requests in Cortex correspond to compute resource requests in Kubernetes. In the example above, the training job will only be scheduled once 2 CPUs and 1Gi of memory are available, and the job will be guaranteed to have access to those resources throughout its execution. In some cases, a Cortex compute resource request can be (or may default to) `Null`.
@@ -21,3 +22,9 @@ One unit of CPU corresponds to one virtual CPU on AWS. Fractional requests are a
2122
## Memory
2223

2324
One unit of memory is one byte. Memory can be expressed as an integer or by using one of these suffixes: `K`, `M`, `G`, `T` (or their power-of two counterparts: `Ki`, `Mi`, `Gi`, `Ti`). For example, the following values represent roughly the same memory: `128974848`, `129e6`, `129M`, `123Mi`.
25+
26+
## GPU
27+
One unit of GPU corresponds to one virtual GPU on AWS. Fractional requests are not allowed. Here's some information on [adding GPU enabled nodes on EKS](https://docs.aws.amazon.com/en_ca/eks/latest/userguide/gpu-ami.html).
28+
29+
## GPU Support
30+
We recommend using GPU compute requests on API resources only if you have enough nodes in your cluster to support the number of GPU requests in model training plus APIs (ideally with an autoscaler). Otherwise, due to the nature of zero downtime rolling updates, your model training will not have sufficient GPU resources as there will always be GPUs consumed by APIs from the previous deployment.

docs/applications/resources/apis.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ Serve models at scale and use them to build smarter applications.
1212
replicas: <int> # number of replicas to launch (default: 1)
1313
cpu: <string> # CPU request (default: Null)
1414
mem: <string> # memory request (default: Null)
15+
gpu: <string> # gpu request (default: Null)
1516
tags:
1617
<string>: <scalar> # arbitrary key/value pairs to attach to the resource (optional)
1718
...

docs/applications/resources/models.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ Train custom TensorFlow models at scale.
4444
compute:
4545
cpu: <string> # CPU request (default: Null)
4646
mem: <string> # memory request (default: Null)
47+
gpu: <string> # GPU request (default: Null)
4748

4849
tags:
4950
<string>: <scalar> # arbitrary key/value pairs to attach to the resource (optional)

docs/applications/resources/statuses.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
| terminated | Resource was terminated |
1313
| upstream error | Resource was not created due to an error in one of its dependencies |
1414
| upstream termination | Resource was not created because one of its dependencies was terminated |
15-
| compute unavailable | Resource's workload could not start due to insufficient memory or CPU in the cluster |
15+
| compute unavailable | Resource's workload could not start due to insufficient memory, CPU or GPU in the cluster |
1616

1717
## API statuses
1818

@@ -29,4 +29,4 @@
2929
| update skipped | API was not updated due to an error in another resource; a previous version of this API is ready |
3030
| upstream error | API was not created due to an error in one of its dependencies; a previous version of this API may be ready |
3131
| upstream termination | API was not created because one of its dependencies was terminated; a previous version of this API may be ready |
32-
| compute unavailable | API could not start due to insufficient memory or CPU in the cluster; some replicas may be ready |
32+
| compute unavailable | API could not start due to insufficient memory, CPU or GPU in the cluster; some replicas may be ready |

images/python-packager/Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
FROM ubuntu:18.04
1+
FROM ubuntu:16.04
22

33
RUN apt-get update -qq && apt-get install -y -q \
44
python3 \

images/spark-base/Dockerfile

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
FROM ubuntu:18.04 as builder
1+
FROM ubuntu:16.04 as builder
22

33
RUN apt-get update -qq && apt-get install -y -q \
44
git \
@@ -47,7 +47,7 @@ RUN wget -q -P $SPARK_HOME/jars/ http://central.maven.org/maven2/com/amazonaws/a
4747
COPY images/spark-base/conf/* $SPARK_HOME/conf/
4848

4949

50-
FROM ubuntu:18.04
50+
FROM ubuntu:16.04
5151

5252
ENV JAVA_HOME="/usr/lib/jvm/java-8-openjdk-amd64"
5353
ENV HADOOP_HOME="/opt/hadoop"

images/tf-base-gpu/Dockerfile

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
FROM tensorflow/tensorflow:1.12.0-gpu-py3
2+
3+
RUN apt-get update -qq && apt-get install -y -q \
4+
zlib1g-dev \
5+
&& apt-get clean -qq && rm -rf /var/lib/apt/lists/*

images/tf-base/Dockerfile

Lines changed: 1 addition & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,6 @@
1-
FROM ubuntu:18.04
2-
3-
ARG TF_VERSION="1.12.0"
1+
FROM tensorflow/tensorflow:1.12.0-py3
42

53
RUN apt-get update -qq && apt-get install -y -q \
6-
build-essential \
7-
curl \
8-
libfreetype6-dev \
9-
libpng-dev \
10-
libzmq3-dev \
11-
pkg-config \
12-
python3 \
13-
python3-dev \
14-
python3-pip \
15-
rsync \
16-
software-properties-common \
17-
unzip \
184
zlib1g-dev \
195
&& apt-get clean -qq && rm -rf /var/lib/apt/lists/*
206

21-
RUN pip3 install https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-${TF_VERSION}-cp36-cp36m-linux_x86_64.whl && rm -rf /root/.cache/pip*

images/tf-serve-gpu/Dockerfile

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
FROM cortexlabs/tf-base-gpu
2+
3+
ARG TF_VERSION="1.12.0"
4+
5+
RUN curl -o tensorflow-model-server.deb http://storage.googleapis.com/tensorflow-serving-apt/pool/tensorflow-model-server-${TF_VERSION}/t/tensorflow-model-server/tensorflow-model-server_${TF_VERSION}_all.deb
6+
RUN dpkg -i tensorflow-model-server.deb
7+
8+
ENTRYPOINT ["tensorflow_model_server"]

images/tf-serve/Dockerfile

Lines changed: 1 addition & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,25 +1,9 @@
1-
FROM ubuntu:18.04
1+
FROM cortexlabs/tf-base
22

33
ARG TF_VERSION="1.12.0"
44

55
RUN apt-get update -qq && apt-get install -y -q \
6-
automake \
7-
build-essential \
86
curl \
9-
libcurl3-dev \
10-
git \
11-
libtool \
12-
libfreetype6-dev \
13-
libpng-dev \
14-
libzmq3-dev \
15-
pkg-config \
16-
python3-dev \
17-
python3-numpy \
18-
python3-pip \
19-
software-properties-common \
20-
swig \
21-
zip \
22-
zlib1g-dev \
237
&& apt-get clean -qq && rm -rf /var/lib/apt/lists/*
248

259
RUN curl -o tensorflow-model-server.deb http://storage.googleapis.com/tensorflow-serving-apt/pool/tensorflow-model-server-${TF_VERSION}/t/tensorflow-model-server/tensorflow-model-server_${TF_VERSION}_all.deb

images/tf-train-gpu/Dockerfile

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
FROM cortexlabs/tf-base-gpu
2+
3+
ENV PYTHONPATH="/src:${PYTHONPATH}"
4+
5+
COPY pkg/workloads/lib/requirements.txt /src/lib/requirements.txt
6+
COPY pkg/workloads/tf_train/requirements.txt /src/tf_train/requirements.txt
7+
RUN pip3 install -r /src/lib/requirements.txt && \
8+
pip3 install -r /src/tf_train/requirements.txt && \
9+
rm -rf /root/.cache/pip*
10+
11+
COPY pkg/workloads/consts.py /src/
12+
COPY pkg/workloads/lib /src/lib
13+
COPY pkg/workloads/tf_train /src/tf_train
14+
15+
ENTRYPOINT ["/usr/bin/python3", "/src/tf_train/train.py"]

pkg/api/userconfig/compute.go

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,7 @@ func (sparkCompute *SparkCompute) ID() string {
142142
type TFCompute struct {
143143
CPU *Quantity `json:"cpu" yaml:"cpu"`
144144
Mem *Quantity `json:"mem" yaml:"mem"`
145+
GPU *int64 `json:"gpu" yaml:"gpu"`
145146
}
146147

147148
var tfComputeFieldValidation = &cr.StructFieldValidation{
@@ -166,6 +167,13 @@ var tfComputeFieldValidation = &cr.StructFieldValidation{
166167
Min: k8sresource.MustParse("0"),
167168
}),
168169
},
170+
&cr.StructFieldValidation{
171+
StructField: "GPU",
172+
Int64PtrValidation: &cr.Int64PtrValidation{
173+
Default: nil,
174+
GreaterThan: util.Int64Ptr(0),
175+
},
176+
},
169177
},
170178
},
171179
}
@@ -181,6 +189,7 @@ type APICompute struct {
181189
Replicas int32 `json:"replicas" yaml:"replicas"`
182190
CPU *Quantity `json:"cpu" yaml:"cpu"`
183191
Mem *Quantity `json:"mem" yaml:"mem"`
192+
GPU int64 `json:"gpu" yaml:"gpu"`
184193
}
185194

186195
var apiComputeFieldValidation = &cr.StructFieldValidation{
@@ -212,6 +221,13 @@ var apiComputeFieldValidation = &cr.StructFieldValidation{
212221
Min: k8sresource.MustParse("0"),
213222
}),
214223
},
224+
&cr.StructFieldValidation{
225+
StructField: "GPU",
226+
Int64Validation: &cr.Int64Validation{
227+
Default: 0,
228+
GreaterThanOrEqualTo: util.Int64Ptr(0),
229+
},
230+
},
215231
},
216232
},
217233
}
@@ -221,13 +237,15 @@ func (apiCompute *APICompute) ID() string {
221237
buf.WriteString(s.Int32(apiCompute.Replicas))
222238
buf.WriteString(QuantityPtrID(apiCompute.CPU))
223239
buf.WriteString(QuantityPtrID(apiCompute.Mem))
240+
buf.WriteString(s.Int64(apiCompute.GPU))
224241
return util.HashBytes(buf.Bytes())
225242
}
226243

227244
func (apiCompute *APICompute) IDWithoutReplicas() string {
228245
var buf bytes.Buffer
229246
buf.WriteString(QuantityPtrID(apiCompute.CPU))
230247
buf.WriteString(QuantityPtrID(apiCompute.Mem))
248+
buf.WriteString(s.Int64(apiCompute.GPU))
231249
return util.HashBytes(buf.Bytes())
232250
}
233251

@@ -284,6 +302,11 @@ func MaxTFCompute(tfComputes ...*TFCompute) *TFCompute {
284302
aggregated.Mem = tfCompute.Mem
285303
}
286304
}
305+
if tfCompute.GPU != nil {
306+
if aggregated.GPU == nil || *tfCompute.GPU > *aggregated.GPU {
307+
aggregated.GPU = tfCompute.GPU
308+
}
309+
}
287310
}
288311

289312
return &aggregated
@@ -299,5 +322,10 @@ func (apiCompute *APICompute) Equal(apiCompute2 APICompute) bool {
299322
if !QuantityPtrsEqual(apiCompute.Mem, apiCompute2.Mem) {
300323
return false
301324
}
325+
326+
if apiCompute.GPU != apiCompute2.GPU {
327+
return false
328+
}
329+
302330
return true
303331
}

pkg/operator/cortexconfig/cortex_config.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,8 @@ var (
3434
TFServeImage string
3535
TFAPIImage string
3636
PythonPackagerImage string
37+
TFTrainImageGPU string
38+
TFServeImageGPU string
3739
)
3840

3941
func init() {
@@ -47,6 +49,8 @@ func init() {
4749
TFServeImage = getStr("IMAGE_TF_SERVE")
4850
TFAPIImage = getStr("IMAGE_TF_API")
4951
PythonPackagerImage = getStr("IMAGE_PYTHON_PACKAGER")
52+
TFTrainImageGPU = getStr("IMAGE_TF_TRAIN_GPU")
53+
TFServeImageGPU = getStr("IMAGE_TF_SERVE_GPU")
5054
}
5155

5256
//

0 commit comments

Comments
 (0)