Skip to content

Commit 1a7e844

Browse files
1vndeliahu
authored andcommitted
Make TF model paths easier, update download init container (#373)
1 parent 601dd1e commit 1a7e844

File tree

19 files changed

+184
-90
lines changed

19 files changed

+184
-90
lines changed

Makefile

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,7 @@ ci-build-images:
154154
@./build/build-image.sh images/istio-galley istio-galley
155155
@./build/build-image.sh images/istio-pilot istio-pilot
156156
@./build/build-image.sh images/istio-proxy istio-proxy
157+
@./build/build-image.sh images/downloader downloader
157158

158159
ci-push-images:
159160
@./build/push-image.sh manager
@@ -172,6 +173,7 @@ ci-push-images:
172173
@./build/push-image.sh istio-galley
173174
@./build/push-image.sh istio-pilot
174175
@./build/push-image.sh istio-proxy
176+
@./build/push-image.sh downloader
175177

176178

177179
ci-build-cli:

cortex.sh

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -161,6 +161,7 @@ export CORTEX_IMAGE_ISTIO_CITADEL="${CORTEX_IMAGE_ISTIO_CITADEL:-cortexlabs/isti
161161
export CORTEX_IMAGE_ISTIO_GALLEY="${CORTEX_IMAGE_ISTIO_GALLEY:-cortexlabs/istio-galley:$CORTEX_VERSION_STABLE}"
162162
export CORTEX_IMAGE_ISTIO_PILOT="${CORTEX_IMAGE_ISTIO_PILOT:-cortexlabs/istio-pilot:$CORTEX_VERSION_STABLE}"
163163
export CORTEX_IMAGE_ISTIO_PROXY="${CORTEX_IMAGE_ISTIO_PROXY:-cortexlabs/istio-proxy:$CORTEX_VERSION_STABLE}"
164+
export CORTEX_IMAGE_DOWNLOADER="${CORTEX_IMAGE_DOWNLOADER:-cortexlabs/downloader:$CORTEX_VERSION_STABLE}"
164165

165166
export CORTEX_ENABLE_TELEMETRY="${CORTEX_ENABLE_TELEMETRY:-""}"
166167
export CORTEX_TELEMETRY_URL="${CORTEX_TELEMETRY_URL:-"https://telemetry.cortexlabs.dev"}"
@@ -220,6 +221,7 @@ function install_cortex() {
220221
-e CORTEX_IMAGE_ISTIO_GALLEY=$CORTEX_IMAGE_ISTIO_GALLEY \
221222
-e CORTEX_IMAGE_ISTIO_PILOT=$CORTEX_IMAGE_ISTIO_PILOT \
222223
-e CORTEX_IMAGE_ISTIO_PROXY=$CORTEX_IMAGE_ISTIO_PROXY \
224+
-e CORTEX_IMAGE_DOWNLOADER=$CORTEX_IMAGE_DOWNLOADER \
223225
-e CORTEX_ENABLE_TELEMETRY=$CORTEX_ENABLE_TELEMETRY \
224226
$CORTEX_IMAGE_MANAGER
225227
}

dev/registry.sh

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ function create_registry() {
5151
aws ecr create-repository --repository-name=cortexlabs/cluster-autoscaler --region=$REGISTRY_REGION || true
5252
aws ecr create-repository --repository-name=cortexlabs/nvidia --region=$REGISTRY_REGION || true
5353
aws ecr create-repository --repository-name=cortexlabs/metrics-server --region=$REGISTRY_REGION || true
54+
aws ecr create-repository --repository-name=cortexlabs/downloader --region=$REGISTRY_REGION || true
5455
}
5556

5657
### HELPERS ###
@@ -139,6 +140,7 @@ elif [ "$cmd" = "update" ]; then
139140
fi
140141

141142
build_and_push $ROOT/images/tf-api tf-api latest
143+
build_and_push $ROOT/images/downloader downloader latest
142144
build_and_push $ROOT/images/onnx-serve onnx-serve latest
143145

144146
cleanup

docs/cluster/config.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@ export CORTEX_IMAGE_ISTIO_PROXY="cortexlabs/istio-proxy:master"
5959
export CORTEX_IMAGE_ISTIO_PILOT="cortexlabs/istio-pilot:master"
6060
export CORTEX_IMAGE_ISTIO_CITADEL="cortexlabs/istio-citadel:master"
6161
export CORTEX_IMAGE_ISTIO_GALLEY="cortexlabs/istio-galley:master"
62+
export CORTEX_IMAGE_DOWNLOADER="cortexlabs/downloader:master"
6263

6364
# Flag to enable collecting error reports and usage stats. If flag is not set to either "true" or "false", you will be prompted.
6465
export CORTEX_ENABLE_TELEMETRY=""

docs/cluster/development.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,7 @@ export CORTEX_IMAGE_ISTIO_PROXY="XXXXXXXX.dkr.ecr.us-west-2.amazonaws.com/cortex
7777
export CORTEX_IMAGE_ISTIO_PILOT="XXXXXXXX.dkr.ecr.us-west-2.amazonaws.com/cortexlabs/istio-pilot:latest"
7878
export CORTEX_IMAGE_ISTIO_CITADEL="XXXXXXXX.dkr.ecr.us-west-2.amazonaws.com/cortexlabs/istio-citadel:latest"
7979
export CORTEX_IMAGE_ISTIO_GALLEY="XXXXXXXX.dkr.ecr.us-west-2.amazonaws.com/cortexlabs/istio-galley:latest"
80+
export CORTEX_IMAGE_DOWNLOADER="XXXXXXXX.dkr.ecr.us-west-2.amazonaws.com/cortexlabs/downloader:latest"
8081

8182
export CORTEX_ENABLE_TELEMETRY="false"
8283
```

docs/deployments/packaging-models.md

Lines changed: 19 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -2,38 +2,43 @@
22

33
## TensorFlow
44

5-
Export your trained model and zip the model directory. An example is shown below (here is the [complete example](https://github.com/cortexlabs/cortex/blob/master/examples/iris/models/tensorflow_model.py)):
5+
Export your trained model and upload the export directory, or checkpoint directory containing the export directory, which is usually the case if you used `estimator.train_and_evaluate`. An example is shown below (here is the [complete example](https://github.com/cortexlabs/cortex/blob/master/examples/sentiment)):
66

77
```Python
88
import tensorflow as tf
99
import shutil
1010
import os
1111

1212
...
13-
14-
classifier = tf.estimator.Estimator(
15-
model_fn=my_model, model_dir="iris", params={"hidden_units": [10, 10], "n_classes": 3}
16-
)
17-
18-
exporter = tf.estimator.FinalExporter("estimator", serving_input_fn, as_text=False)
19-
train_spec = tf.estimator.TrainSpec(train_input_fn, max_steps=1000)
20-
eval_spec = tf.estimator.EvalSpec(eval_input_fn, exporters=[exporter], name="estimator-eval")
21-
22-
tf.estimator.train_and_evaluate(classifier, train_spec, eval_spec)
13+
OUPUT_DIR="bert"
14+
estimator = tf.estimator.Estimator(model_fn=model_fn...)
15+
16+
# TF Serving requires a special input_fn used at serving time
17+
def serving_input_fn():
18+
inputs = tf.placeholder(shape=[128], dtype=tf.int32)
19+
features = {
20+
"input_ids": tf.expand_dims(inputs, 0),
21+
"input_mask": tf.expand_dims(inputs, 0),
22+
"segment_ids": tf.expand_dims(inputs, 0),
23+
"label_ids": tf.placeholder(shape=[0], dtype=tf.int32),
24+
}
25+
return tf.estimator.export.ServingInputReceiver(features=features, receiver_tensors=inputs)
26+
27+
estimator.export_savedmodel(OUPUT_DIR, serving_input_fn, strip_default_attrs=True)
2328
```
2429

25-
Upload the exported version directory to Amazon S3 using the AWS web console or CLI:
30+
Upload the checkpoint directory to Amazon S3 using the AWS web console or CLI:
2631

2732
```text
28-
$ aws s3 sync ./iris/export/estimator/156293432 s3://my-bucket/iris/156293432
33+
$ aws s3 sync ./bert s3://my-bucket/bert
2934
```
3035

3136
Reference your model in an `api`:
3237

3338
```yaml
3439
- kind: api
3540
name: my-api
36-
model: s3://my-bucket/iris/156293432
41+
model: s3://my-bucket/bert
3742
```
3843
3944
## ONNX

examples/image-classifier/cortex.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,5 +3,5 @@
33

44
- kind: api
55
name: classifier
6-
model: s3://cortex-examples/imagenet/1566492692
6+
model: s3://cortex-examples/imagenet/
77
request_handler: imagenet.py

images/downloader/Dockerfile

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
FROM ubuntu:18.04
2+
3+
ENV PYTHONPATH="/src:${PYTHONPATH}"
4+
5+
RUN apt-get update -qq && apt-get install -y -q \
6+
python3 \
7+
python3-dev \
8+
python3-pip \
9+
&& apt-get clean -qq && rm -rf /var/lib/apt/lists/* && \
10+
pip3 install --upgrade \
11+
pip \
12+
setuptools \
13+
&& rm -rf /root/.cache/pip*
14+
15+
COPY pkg/workloads/cortex/lib/requirements.txt /src/cortex/lib/requirements.txt
16+
RUN pip3 install -r /src/cortex/lib/requirements.txt && \
17+
rm -rf /root/.cache/pip*
18+
19+
COPY pkg/workloads/cortex/consts.py /src/cortex/
20+
COPY pkg/workloads/cortex/lib /src/cortex/lib
21+
COPY pkg/workloads/cortex/downloader /src/cortex/downloader
22+
23+
ENTRYPOINT ["/usr/bin/python3", "/src/cortex/downloader/download.py"]

manager/install_cortex.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@ function setup_configmap() {
6464
--from-literal='IMAGE_ONNX_SERVE'=$CORTEX_IMAGE_ONNX_SERVE \
6565
--from-literal='IMAGE_ONNX_SERVE_GPU'=$CORTEX_IMAGE_ONNX_SERVE_GPU \
6666
--from-literal='IMAGE_TF_API'=$CORTEX_IMAGE_TF_API \
67+
--from-literal='IMAGE_DOWNLOADER'=$CORTEX_IMAGE_DOWNLOADER \
6768
--from-literal='IMAGE_PYTHON_PACKAGER'=$CORTEX_IMAGE_PYTHON_PACKAGER \
6869
--from-literal='IMAGE_TF_SERVE_GPU'=$CORTEX_IMAGE_TF_SERVE_GPU \
6970
--from-literal='ENABLE_TELEMETRY'=$CORTEX_ENABLE_TELEMETRY \

pkg/lib/aws/aws.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ import (
3131
type Client struct {
3232
Region string
3333
Bucket string
34-
s3Client *s3.S3
34+
S3 *s3.S3
3535
stsClient *sts.STS
3636
cloudWatchLogsClient *cloudwatchlogs.CloudWatchLogs
3737
CloudWatchMetrics *cloudwatch.CloudWatch
@@ -48,7 +48,7 @@ func New(region string, bucket string, withAccountID bool) (*Client, error) {
4848
awsClient := &Client{
4949
Bucket: bucket,
5050
Region: region,
51-
s3Client: s3.New(sess),
51+
S3: s3.New(sess),
5252
stsClient: sts.New(sess),
5353
CloudWatchMetrics: cloudwatch.New(sess),
5454
cloudWatchLogsClient: cloudwatchlogs.New(sess),

pkg/lib/aws/s3.go

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ func S3PathJoin(paths ...string) string {
7070

7171
func (c *Client) IsS3File(keys ...string) (bool, error) {
7272
for _, key := range keys {
73-
_, err := c.s3Client.HeadObject(&s3.HeadObjectInput{
73+
_, err := c.S3.HeadObject(&s3.HeadObjectInput{
7474
Bucket: aws.String(c.Bucket),
7575
Key: aws.String(key),
7676
})
@@ -88,7 +88,7 @@ func (c *Client) IsS3File(keys ...string) (bool, error) {
8888

8989
func (c *Client) IsS3Prefix(prefixes ...string) (bool, error) {
9090
for _, prefix := range prefixes {
91-
out, err := c.s3Client.ListObjectsV2(&s3.ListObjectsV2Input{
91+
out, err := c.S3.ListObjectsV2(&s3.ListObjectsV2Input{
9292
Bucket: aws.String(c.Bucket),
9393
Prefix: aws.String(prefix),
9494
})
@@ -138,7 +138,7 @@ func (c *Client) IsS3PathDir(s3Paths ...string) (bool, error) {
138138
}
139139

140140
func (c *Client) UploadBytesToS3(data []byte, key string) error {
141-
_, err := c.s3Client.PutObject(&s3.PutObjectInput{
141+
_, err := c.S3.PutObject(&s3.PutObjectInput{
142142
Body: bytes.NewReader(data),
143143
Key: aws.String(key),
144144
Bucket: aws.String(c.Bucket),
@@ -210,7 +210,7 @@ func (c *Client) ReadMsgpackFromS3(objPtr interface{}, key string) error {
210210
}
211211

212212
func (c *Client) ReadStringFromS3(key string) (string, error) {
213-
response, err := c.s3Client.GetObject(&s3.GetObjectInput{
213+
response, err := c.S3.GetObject(&s3.GetObjectInput{
214214
Key: aws.String(key),
215215
Bucket: aws.String(c.Bucket),
216216
})
@@ -225,7 +225,7 @@ func (c *Client) ReadStringFromS3(key string) (string, error) {
225225
}
226226

227227
func (c *Client) ReadBytesFromS3(key string) ([]byte, error) {
228-
response, err := c.s3Client.GetObject(&s3.GetObjectInput{
228+
response, err := c.S3.GetObject(&s3.GetObjectInput{
229229
Key: aws.String(key),
230230
Bucket: aws.String(c.Bucket),
231231
})
@@ -246,7 +246,7 @@ func (c *Client) ListPrefix(prefix string, maxResults int64) ([]*s3.Object, erro
246246
MaxKeys: aws.Int64(maxResults),
247247
}
248248

249-
output, err := c.s3Client.ListObjectsV2(listObjectsInput)
249+
output, err := c.S3.ListObjectsV2(listObjectsInput)
250250
if err != nil {
251251
return nil, errors.Wrap(err, prefix)
252252
}
@@ -263,7 +263,7 @@ func (c *Client) DeleteFromS3ByPrefix(prefix string, continueIfFailure bool) err
263263

264264
var subErr error
265265

266-
err := c.s3Client.ListObjectsV2Pages(listObjectsInput,
266+
err := c.S3.ListObjectsV2Pages(listObjectsInput,
267267
func(listObjectsOutput *s3.ListObjectsV2Output, lastPage bool) bool {
268268
deleteObjects := make([]*s3.ObjectIdentifier, len(listObjectsOutput.Contents))
269269
for i, object := range listObjectsOutput.Contents {
@@ -276,7 +276,7 @@ func (c *Client) DeleteFromS3ByPrefix(prefix string, continueIfFailure bool) err
276276
Quiet: aws.Bool(true),
277277
},
278278
}
279-
_, newSubErr := c.s3Client.DeleteObjects(deleteObjectsInput)
279+
_, newSubErr := c.S3.DeleteObjects(deleteObjectsInput)
280280
if newSubErr != nil {
281281
subErr = newSubErr
282282
if !continueIfFailure {

pkg/operator/api/userconfig/apis.go

Lines changed: 59 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,11 @@ package userconfig
1818

1919
import (
2020
"fmt"
21+
"path/filepath"
22+
"strconv"
2123
"strings"
2224

25+
"github.com/aws/aws-sdk-go/service/s3"
2326
"github.com/cortexlabs/cortex/pkg/consts"
2427
"github.com/cortexlabs/cortex/pkg/lib/aws"
2528
cr "github.com/cortexlabs/cortex/pkg/lib/configreader"
@@ -147,6 +150,45 @@ func IsValidTensorFlowS3Directory(path string, awsClient *aws.Client) bool {
147150
return true
148151
}
149152

153+
func GetTFServingExportFromS3Path(path string, awsClient *aws.Client) (string, error) {
154+
if IsValidTensorFlowS3Directory(path, awsClient) {
155+
return path, nil
156+
}
157+
158+
bucket, prefix, err := aws.SplitS3Path(path)
159+
if err != nil {
160+
return "", err
161+
}
162+
163+
resp, _ := awsClient.S3.ListObjects(&s3.ListObjectsInput{
164+
Bucket: &bucket,
165+
Prefix: &prefix,
166+
})
167+
168+
highestVersion := int64(0)
169+
var highestPath string
170+
for _, key := range resp.Contents {
171+
if !strings.HasSuffix(*key.Key, "saved_model.pb") {
172+
continue
173+
}
174+
175+
keyParts := strings.Split(*key.Key, "/")
176+
versionStr := keyParts[len(keyParts)-1]
177+
version, err := strconv.ParseInt(versionStr, 10, 64)
178+
if err != nil {
179+
version = 0
180+
}
181+
182+
possiblePath := "s3://" + filepath.Join(bucket, filepath.Join(keyParts[:len(keyParts)-1]...))
183+
if version >= highestVersion && IsValidTensorFlowS3Directory(possiblePath, awsClient) {
184+
highestVersion = version
185+
highestPath = possiblePath
186+
}
187+
}
188+
189+
return highestPath, nil
190+
}
191+
150192
func (api *API) UserConfigStr() string {
151193
var sb strings.Builder
152194
sb.WriteString(api.ResourceFields.UserConfigStr())
@@ -190,27 +232,31 @@ func (api *API) Validate() error {
190232
return err
191233
}
192234

193-
switch api.ModelFormat {
194-
case ONNXModelFormat:
235+
switch {
236+
case api.ModelFormat == ONNXModelFormat:
195237
if ok, err := awsClient.IsS3PathFile(api.Model); err != nil || !ok {
196238
return errors.Wrap(ErrorExternalNotFound(api.Model), Identify(api), ModelKey)
197239
}
198-
case TensorFlowModelFormat:
199-
if !IsValidTensorFlowS3Directory(api.Model, awsClient) {
240+
case api.ModelFormat == TensorFlowModelFormat:
241+
path, err := GetTFServingExportFromS3Path(api.Model, awsClient)
242+
if path == "" || err != nil {
200243
return errors.Wrap(ErrorInvalidTensorflowDir(api.Model), Identify(api), ModelKey)
201244
}
245+
case strings.HasSuffix(api.Model, ".onnx"):
246+
api.ModelFormat = ONNXModelFormat
247+
if ok, err := awsClient.IsS3PathFile(api.Model); err != nil || !ok {
248+
return errors.Wrap(ErrorExternalNotFound(api.Model), Identify(api), ModelKey)
249+
}
202250
default:
203-
switch {
204-
case strings.HasSuffix(api.Model, ".onnx"):
205-
api.ModelFormat = ONNXModelFormat
206-
if ok, err := awsClient.IsS3PathFile(api.Model); err != nil || !ok {
207-
return errors.Wrap(ErrorExternalNotFound(api.Model), Identify(api), ModelKey)
208-
}
209-
case IsValidTensorFlowS3Directory(api.Model, awsClient):
210-
api.ModelFormat = TensorFlowModelFormat
211-
default:
251+
path, err := GetTFServingExportFromS3Path(api.Model, awsClient)
252+
if err != nil {
253+
return errors.Wrap(err, Identify(api), ModelKey)
254+
}
255+
if path == "" {
212256
return errors.Wrap(ErrorUnableToInferModelFormat(api.Model), Identify(api))
213257
}
258+
api.ModelFormat = TensorFlowModelFormat
259+
api.Model = path
214260
}
215261

216262
if api.ModelFormat == TensorFlowModelFormat && api.TFServing == nil {

pkg/operator/api/userconfig/errors.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -277,8 +277,8 @@ func ErrorExternalNotFound(path string) error {
277277

278278
var onnxExpectedStructMessage = `For ONNX models, the path should end in .onnx`
279279

280-
var tfExpectedStructMessage = `For TensorFlow models, the path should be a directory with the following structure:
281-
1523423423/ (version prefix, usually a timestamp)
280+
var tfExpectedStructMessage = `For TensorFlow models, the path must contain a directory with the following structure:
281+
1523423423/ (Version prefix, usually a timestamp)
282282
├── saved_model.pb
283283
└── variables/
284284
├── variables.index

pkg/operator/config/config.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ type CortexConfig struct {
4646
OperatorImage string `json:"operator_image"`
4747
TFServeImage string `json:"tf_serve_image"`
4848
TFAPIImage string `json:"tf_api_image"`
49+
DownloaderImage string `json:"downloader_image"`
4950
PythonPackagerImage string `json:"python_packager_image"`
5051
TFServeImageGPU string `json:"tf_serve_image_gpu"`
5152
ONNXServeImage string `json:"onnx_serve_image"`
@@ -66,6 +67,7 @@ func Init() error {
6667
OperatorImage: getStr("IMAGE_OPERATOR"),
6768
TFServeImage: getStr("IMAGE_TF_SERVE"),
6869
TFAPIImage: getStr("IMAGE_TF_API"),
70+
DownloaderImage: getStr("IMAGE_DOWNLOADER"),
6971
PythonPackagerImage: getStr("IMAGE_PYTHON_PACKAGER"),
7072
TFServeImageGPU: getStr("IMAGE_TF_SERVE_GPU"),
7173
ONNXServeImage: getStr("IMAGE_ONNX_SERVE"),

0 commit comments

Comments
 (0)