This repository was archived by the owner on Sep 3, 2022. It is now read-only.

Improve inception package so there is no need to have a GCS copy. #175

Merged: 1 commit, Feb 9, 2017
26 changes: 22 additions & 4 deletions solutionbox/inception/datalab_solutions/inception/_cloud.py
@@ -32,7 +32,8 @@


 _TF_GS_URL = 'gs://cloud-datalab/deploy/tf/tensorflow-0.12.0rc1-cp27-none-linux_x86_64.whl'
-
+# Keep in sync with "data_files" in package's setup.py
+_SETUP_PY = '/datalab/packages_setup/inception/setup.py'

 class Cloud(object):
   """Class for cloud training, preprocessing and prediction."""
@@ -42,17 +43,31 @@ def __init__(self, checkpoint=None):
     if self._checkpoint is None:
       self._checkpoint = _util._DEFAULT_CHECKPOINT_GSURL

+  def _repackage_to_staging(self, output_path):
+    """Repackage inception from local installed location and copy it to GCS.
+    """
+
+    import datalab.mlalpha as mlalpha
+
+    # Find the package root. __file__ is under [package_root]/datalab_solutions/inception.
+    package_root = os.path.join(os.path.dirname(__file__), '../../')
+    staging_package_url = os.path.join(output_path, 'staging', 'inception.tar.gz')
+    mlalpha.package_and_copy(package_root, _SETUP_PY, staging_package_url)
+    return staging_package_url
+
   def preprocess(self, dataset, output_dir, pipeline_option=None):
     """Cloud preprocessing with Cloud DataFlow."""

     import datalab.mlalpha as mlalpha

     job_name = 'preprocess-inception-' + datetime.datetime.now().strftime('%y%m%d-%H%M%S')
+    staging_package_url = self._repackage_to_staging(output_dir)
     options = {
       'staging_location': os.path.join(output_dir, 'tmp', 'staging'),
       'temp_location': os.path.join(output_dir, 'tmp'),
       'job_name': job_name,
       'project': _util.default_project(),
-      'extra_packages': [ml.sdk_location, _util._PACKAGE_GS_URL, _TF_GS_URL],
+      'extra_packages': [ml.sdk_location, staging_package_url, _TF_GS_URL],
       'teardown_policy': 'TEARDOWN_ALWAYS',
       'no_save_main_session': True
     }
@@ -64,7 +79,8 @@ def preprocess(self, dataset, output_dir, pipeline_option=None):
     if type(dataset) is mlalpha.CsvDataSet:
       _preprocess.configure_pipeline_csv(p, self._checkpoint, dataset.files, output_dir, job_name)
     elif type(dataset) is mlalpha.BigQueryDataSet:
-      _preprocess.configure_pipeline_bigquery(p, self._checkpoint, dataset.sql, output_dir, job_name)
+      _preprocess.configure_pipeline_bigquery(p, self._checkpoint, dataset.sql,
+                                              output_dir, job_name)
     else:
       raise ValueError('preprocess takes CsvDataSet or BigQueryDataSet only.')
     p.run()
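
For context, the options dict above follows the standard DataFlow pipeline-options shape. A minimal sketch of how such a dict typically becomes a runnable pipeline, using today's apache_beam names (the PR-era SDK spelled these imports differently, so treat the exact module paths as assumptions):

    import apache_beam as beam
    from apache_beam.options.pipeline_options import PipelineOptions

    # Turn a plain dict like 'options' above into typed pipeline options.
    opts = PipelineOptions(flags=[], runner='DataflowRunner', **options)
    p = beam.Pipeline(options=opts)
    # ... apply the preprocessing transforms to p, then:
    p.run()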
@@ -75,6 +91,8 @@ def train(self, input_dir, batch_size, max_steps, output_path,
     """Cloud training with CloudML trainer service."""

     import datalab.mlalpha as mlalpha
+
+    staging_package_url = self._repackage_to_staging(output_path)
     job_args = {
       'input_dir': input_dir,
       'output_path': output_path,
@@ -83,7 +101,7 @@
       'checkpoint': self._checkpoint
     }
     job_request = {
-      'package_uris': _util._PACKAGE_GS_URL,
+      'package_uris': staging_package_url,
       'python_module': 'datalab_solutions.inception.task',
       'scale_tier': scale_tier,
       'region': region,
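
The new _repackage_to_staging helper delegates the heavy lifting to datalab.mlalpha.package_and_copy, whose implementation is not part of this diff. Conceptually it builds a source tarball from package_root using the deployed setup.py and uploads it to the staging URL; a rough sketch of an equivalent helper under those assumptions (this is not the library's actual code):

    import os
    import shutil
    import subprocess
    import tempfile

    def package_and_copy(package_root, setup_py, staging_package_url):
      """Builds an sdist tarball and stages it on GCS (illustrative only)."""
      build_dir = tempfile.mkdtemp()
      try:
        # 'sdist --dist-dir' writes a single .tar.gz into build_dir.
        subprocess.check_call(
            ['python', setup_py, 'sdist', '--dist-dir', build_dir],
            cwd=package_root)
        tarball = os.path.join(build_dir, os.listdir(build_dir)[0])
        # Stage the tarball where DataFlow/CloudML workers can fetch it.
        subprocess.check_call(['gsutil', 'cp', tarball, staging_package_url])
      finally:
        shutil.rmtree(build_dir)

This is why setup.py itself must land on local disk (see the setup.py change below): at run time the package is rebuilt from its installed location rather than fetched from a fixed GCS URL.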
1 change: 0 additions & 1 deletion solutionbox/inception/datalab_solutions/inception/_util.py
@@ -24,7 +24,6 @@
 from tensorflow.python.lib.io import file_io


-_PACKAGE_GS_URL = 'gs://cloud-datalab/packages/inception-0.1.tar.gz'
 _DEFAULT_CHECKPOINT_GSURL = 'gs://cloud-ml-data/img/flower_photos/inception_v3_2016_08_28.ckpt'


21 changes: 0 additions & 21 deletions solutionbox/inception/release.sh

This file was deleted.

3 changes: 3 additions & 0 deletions solutionbox/inception/setup.py
@@ -25,6 +25,9 @@
     'datalab_solutions',
     'datalab_solutions.inception',
   ],
+  # setup.py needs to be deployed so it can be repackaged from local installation for cloud run.
+  data_files=[('/datalab/packages_setup/inception', ['setup.py'])],
   description='Google Cloud Datalab Inception Package',
   author='Google',
   author_email='google-cloud-datalab-feedback@googlegroups.com',
Review thread on the data_files line:

Contributor: Can it be relative? This will not work outside the VM, right?

@qimingj (Author), Feb 9, 2017: It can be relative, but AFAIK a relative path is resolved against sys.prefix (which also depends on which user the install runs under). The absolute path should work on Linux/Mac if the package is installed as root; pip will create the directory.
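
To make the point concrete, a minimal setup.py sketch contrasting the two target styles (paths illustrative; this is not the package's full setup.py):

    from setuptools import setup

    setup(
        name='inception',
        version='0.1',
        packages=['datalab_solutions', 'datalab_solutions.inception'],
        data_files=[
            # Absolute target: installed at exactly this path; requires write
            # access there (e.g. a root install), as noted above.
            ('/datalab/packages_setup/inception', ['setup.py']),
            # A relative target would be resolved against sys.prefix instead:
            # ('share/inception', ['setup.py'])
            #   -> <sys.prefix>/share/inception/setup.py
        ],
    )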


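Net effect: callers no longer depend on the pre-built gs://cloud-datalab/packages copy; the staging tarball is derived from their own output path. A small illustration (bucket name hypothetical):

    from datalab_solutions.inception import _cloud

    cloud = _cloud.Cloud()
    url = cloud._repackage_to_staging('gs://my-bucket/run1')
    # url == 'gs://my-bucket/run1/staging/inception.tar.gz'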