Skip to content
This repository was archived by the owner on Sep 3, 2022. It is now read-only.

Commit efc5b88

Browse files
committed
Add CloudTrainingConfig namedtuple to wrap cloud training configurations (#178)
* Add a CloudTrainingConfig namedtuple to wrap cloud training configurations.
* Follow up on code-review comments.
1 parent 286cbc4 commit efc5b88

File tree

4 files changed

+53
-6
lines changed

4 files changed

+53
-6
lines changed

datalab/mlalpha/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
from ._analysis import csv_to_dataframe
3030
from ._package_runner import PackageRunner
3131
from ._feature_slice_view import FeatureSliceView
32+
from ._cloud_training_config import CloudTrainingConfig
3233
from ._util import *
3334

3435

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
# Copyright 2017 Google Inc. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
from collections import namedtuple

# Private base tuple holding the raw fields; the public CloudTrainingConfig
# subclass below exists only to attach user-facing documentation.
# Typename matches the public class so repr/pickle identity is consistent
# (the original used the mismatched name "CloudConfig").
_CloudTrainingConfig = namedtuple(
    'CloudTrainingConfig',
    ['region', 'scale_tier', 'master_type', 'worker_type',
     'parameter_server_type', 'worker_count', 'parameter_server_count'])

# Every field except "region" is optional: "scale_tier" defaults to 'BASIC',
# and the CUSTOM-tier machine-type/count fields default to None.
_CloudTrainingConfig.__new__.__defaults__ = ('BASIC', None, None, None, None, None)


class CloudTrainingConfig(_CloudTrainingConfig):
  """A config namedtuple containing cloud specific configurations for CloudML training.

  Fields:
    region: the region of the training job to be submitted. For example, "us-central1".
        Run "gcloud compute regions list" to get a list of regions.
    scale_tier: Specifies the machine types, the number of replicas for workers and
        parameter servers. For example, "STANDARD_1". See
        https://cloud.google.com/ml/reference/rest/v1beta1/projects.jobs#scaletier
        for list of accepted values. Defaults to 'BASIC'.
    master_type: specifies the type of virtual machine to use for your training
        job's master worker. Must set this value when scale_tier is set to CUSTOM.
        See the link in "scale_tier".
    worker_type: specifies the type of virtual machine to use for your training
        job's worker nodes. Must set this value when scale_tier is set to CUSTOM.
    parameter_server_type: specifies the type of virtual machine to use for your training
        job's parameter server. Must set this value when scale_tier is set to CUSTOM.
    worker_count: the number of worker replicas to use for the training job. Each
        replica in the cluster will be of the type specified in "worker_type".
        Must set this value when scale_tier is set to CUSTOM.
    parameter_server_count: the number of parameter server replicas to use. Each
        replica in the cluster will be of the type specified in "parameter_server_type".
        Must set this value when scale_tier is set to CUSTOM.
  """

solutionbox/inception/datalab_solutions/inception/_cloud.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -86,8 +86,7 @@ def preprocess(self, dataset, output_dir, pipeline_option=None):
8686
p.run()
8787
return job_name
8888

89-
def train(self, input_dir, batch_size, max_steps, output_path,
90-
region, scale_tier):
89+
def train(self, input_dir, batch_size, max_steps, output_path, cloud_train_config):
9190
"""Cloud training with CloudML trainer service."""
9291

9392
import datalab.mlalpha as mlalpha
@@ -103,10 +102,9 @@ def train(self, input_dir, batch_size, max_steps, output_path,
103102
job_request = {
104103
'package_uris': staging_package_url,
105104
'python_module': 'datalab_solutions.inception.task',
106-
'scale_tier': scale_tier,
107-
'region': region,
108105
'args': job_args
109106
}
107+
job_request.update(dict(cloud_train_config._asdict()))
110108
cloud_runner = mlalpha.CloudRunner(job_request)
111109
job_id = 'inception_train_' + datetime.datetime.now().strftime('%y%m%d_%H%M%S')
112110
return cloud_runner.run(job_id)

solutionbox/inception/datalab_solutions/inception/_package.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -102,19 +102,20 @@ def local_train(input_dir, batch_size, max_steps, output_dir, checkpoint=None):
102102

103103

104104
def cloud_train(input_dir, batch_size, max_steps, output_dir,
105-
region, scale_tier='BASIC', checkpoint=None):
105+
cloud_train_config, checkpoint=None):
106106
"""Train model in the cloud with CloudML trainer service.
107107
The output can be used for local prediction or for online deployment.
108108
Args:
109109
input_dir: A directory path containing preprocessed results. GCS path only.
110110
batch_size: size of batch used for training.
111111
max_steps: number of steps to train.
112112
output_dir: The output directory to use. GCS path only.
113+
cloud_train_config: a datalab.ml.CloudTrainingConfig object.
113114
checkpoint: the Inception checkpoint to use.
114115
"""
115116

116117
job_info = _cloud.Cloud(checkpoint=checkpoint).train(input_dir, batch_size,
117-
max_steps, output_dir, region, scale_tier)
118+
max_steps, output_dir, cloud_train_config)
118119
if (_util.is_in_IPython()):
119120
import IPython
120121
log_url_query_strings = {

0 commit comments

Comments
 (0)