This repository was archived by the owner on Sep 3, 2022. It is now read-only.

Commit cef4eae

Inception Package Improvements (#186)
* Implement inception cloud batch prediction. Support explicit eval data in preprocessing.
* Follow up on CR comments. Also address changes from latest DataFlow.
1 parent 299163f commit cef4eae

File tree

7 files changed (+426, -187 lines)

datalab/mlalpha/_dataset.py

Lines changed: 52 additions & 16 deletions
@@ -37,15 +37,30 @@ def __init__(self, files, schema=None, schema_file=None):
       schema: A BigQuery schema object in the form of
           [{'name': 'col1', 'type': 'STRING'},
            {'name': 'col2', 'type': 'INTEGER'}]
+          or a single string in the form of 'col1:STRING,col2:INTEGER,col3:FLOAT'.
       schema_file: A JSON serialized schema file. If schema is None, it will try to load from
           schema_file if not None.
+    Raises:
+      ValueError if both schema and schema_file are None.
     """
-    self._schema = None
+    if schema is None and schema_file is None:
+      raise ValueError('schema and schema_file cannot both be None.')
+
     if schema is not None:
-      self._schema = schema
-    elif schema_file is not None:
+      if isinstance(schema, list):
+        self._schema = schema
+      else:
+        self._schema = []
+        for x in schema.split(','):
+          parts = x.split(':')
+          if len(parts) != 2:
+            raise ValueError('invalid schema string "%s"' % x)
+          self._schema.append({'name': parts[0].strip(), 'type': parts[1].strip()})
+    else:
       with ml.util._file.open_local_or_gcs(schema_file, 'r') as f:
         self._schema = json.load(f)
+    if isinstance(files, basestring):
+      files = [files]
     self._files = []
     for file in files:
       # glob_files() returns unicode strings which doesn't make DataFlow happy. So str().
@@ -97,28 +112,48 @@ def sample(self, n):
       skip = [x for x in skip_all if x < row_count]
       skip_all = [x - row_count for x in skip_all if x >= row_count]
       with ml.util._file.open_local_or_gcs(file, 'r') as f:
-        dfs.append(pd.read_csv(file, skiprows=skip, names=names, dtype=dtype, header=None))
+        dfs.append(pd.read_csv(f, skiprows=skip, names=names, dtype=dtype, header=None))
     return pd.concat(dfs, axis=0, ignore_index=True)
 
 
 class BigQueryDataSet(object):
   """DataSet based on BigQuery table or query."""
 
-  def __init__(self, sql):
+  def __init__(self, sql=None, table=None):
     """
     Args:
-      sql: Can be one of:
-          A table name.
-          A SQL query string.
-          A SQL Query module defined with '%%sql --name [module_name]'
+      sql: A SQL query string, or a SQL Query module defined with '%%sql --name [module_name]'
+      table: A table name in the form of "dataset:table".
+    Raises:
+      ValueError if both sql and table are set, or both are None.
     """
-    query, _ = datalab.data.SqlModule.get_sql_statement_with_environment(sql, {})
-    self._sql = query.sql
+    if (sql is None and table is None) or (sql is not None and table is not None):
+      raise ValueError('One and only one of sql and table should be set.')
+
+    self._query = None
+    self._table = None
+    if sql is not None:
+      query, _ = datalab.data.SqlModule.get_sql_statement_with_environment(sql, {})
+      self._query = query.sql
+    if table is not None:
+      self._table = table
+    self._schema = None
 
   @property
-  def sql(self):
-    return self._sql
-
+  def query(self):
+    return self._query
+
+  @property
+  def table(self):
+    return self._table
+
+  @property
+  def schema(self):
+    if self._schema is None:
+      source = self._query or self._table
+      self._schema = bq.Query('SELECT * FROM (%s) LIMIT 1' % source).results().schema
+    return self._schema
+
   def sample(self, n):
     """Samples data into a Pandas DataFrame. Note that it calls BigQuery so it will
     incur cost.
@@ -129,10 +164,11 @@ def sample(self, n):
     Raises:
       Exception if n is larger than number of rows.
     """
-    total = bq.Query('select count(*) from (%s)' % self._sql).results()[0].values()[0]
+    source = self._query or self._table
+    total = bq.Query('select count(*) from (%s)' % source).results()[0].values()[0]
    if n > total:
      raise ValueError('sample larger than population')
    sampling = bq.Sampling.random(n*100.0/float(total))
-    sample = bq.Query(self._sql).sample(sampling=sampling)
+    sample = bq.Query(source).sample(sampling=sampling)
    df = sample.to_dataframe()
    return df
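
The net effect of the _dataset.py changes: CsvDataSet now accepts a compact schema string and a single file path, and BigQueryDataSet separates sql from table and exposes query, table and schema properties. A minimal usage sketch (the GCS path and table name below are hypothetical, and the calls assume datalab.mlalpha is importable in a Datalab notebook):

import datalab.mlalpha as mlalpha

# Schema given as a 'name:TYPE,...' string; a single file path is also accepted.
csv_data = mlalpha.CsvDataSet('gs://my-bucket/images/train.csv',
                              schema='image_url:STRING,label:STRING')
print csv_data.sample(5)          # reads a few rows into a Pandas DataFrame

# Either sql or table must be set, never both.
bq_data = mlalpha.BigQueryDataSet(table='mydataset:labeled_images')
print bq_data.schema              # fetched lazily with a 'SELECT * ... LIMIT 1' query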

solutionbox/inception/datalab_solutions/inception/_cloud.py

Lines changed: 30 additions & 8 deletions
@@ -26,6 +26,7 @@
 
 
 from . import _model
+from . import _predictor
 from . import _preprocess
 from . import _trainer
 from . import _util
@@ -55,7 +56,7 @@ def _repackage_to_staging(self, output_path):
     mlalpha.package_and_copy(package_root, _SETUP_PY, staging_package_url)
     return staging_package_url
 
-  def preprocess(self, dataset, output_dir, pipeline_option=None):
+  def preprocess(self, train_dataset, eval_dataset, output_dir, pipeline_option):
     """Cloud preprocessing with Cloud DataFlow."""
 
     import datalab.mlalpha as mlalpha
@@ -76,13 +77,8 @@ def preprocess(self, dataset, output_dir, pipeline_option=None):
 
     opts = beam.pipeline.PipelineOptions(flags=[], **options)
     p = beam.Pipeline('DataflowRunner', options=opts)
-    if type(dataset) is mlalpha.CsvDataSet:
-      _preprocess.configure_pipeline_csv(p, self._checkpoint, dataset.files, output_dir, job_name)
-    elif type(dataset) is mlalpha.BigQueryDataSet:
-      _preprocess.configure_pipeline_bigquery(p, self._checkpoint, dataset.sql,
-                                              output_dir, job_name)
-    else:
-      raise ValueError('preprocess takes CsvDataSet or BigQueryDataset only.')
+    _preprocess.configure_pipeline(p, train_dataset, eval_dataset, self._checkpoint,
+                                   output_dir, job_name)
     p.run()
     return job_name
 
@@ -136,3 +132,29 @@ def predict(self, model_id, image_files):
     labels_and_scores = [(x['prediction'], x['scores'][labels.index(x['prediction'])])
                          for x in predictions]
     return labels_and_scores
+
+  def batch_predict(self, dataset, model_dir, gcs_staging_location, output_csv,
+                    output_bq_table, pipeline_option):
+    """Cloud batch prediction with a model specified by a GCS directory."""
+
+    import datalab.mlalpha as mlalpha
+
+    job_name = 'batch-predict-inception-' + datetime.datetime.now().strftime('%y%m%d-%H%M%S')
+    staging_package_url = self._repackage_to_staging(gcs_staging_location)
+    options = {
+        'staging_location': os.path.join(gcs_staging_location, 'tmp', 'staging'),
+        'temp_location': os.path.join(gcs_staging_location, 'tmp'),
+        'job_name': job_name,
+        'project': _util.default_project(),
+        'extra_packages': [ml.sdk_location, staging_package_url, _TF_GS_URL],
+        'teardown_policy': 'TEARDOWN_ALWAYS',
+        'no_save_main_session': True
+    }
+    if pipeline_option is not None:
+      options.update(pipeline_option)
+
+    opts = beam.pipeline.PipelineOptions(flags=[], **options)
+    p = beam.Pipeline('DataflowRunner', options=opts)
+    _predictor.configure_pipeline(p, dataset, model_dir, output_csv, output_bq_table)
+    p.run()
+    return job_name
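
As a rough sketch of how the new Cloud.batch_predict entry point is meant to be driven (bucket paths, the model directory, the num_workers option, and the import path below are assumptions; in practice the call is wrapped by cloud_batch_predict in _package.py further down):

import datalab.mlalpha as mlalpha
from datalab_solutions.inception import _cloud   # assumed import path for this package

predict_set = mlalpha.CsvDataSet('gs://my-bucket/to_predict.csv',
                                 schema='image_url:STRING')

# staging_location and temp_location are derived from gcs_staging_location;
# entries in pipeline_option override the defaults assembled above.
job_name = _cloud.Cloud().batch_predict(
    dataset=predict_set,
    model_dir='gs://my-bucket/inception/model',
    gcs_staging_location='gs://my-bucket/staging',
    output_csv='gs://my-bucket/predict_results.csv',
    output_bq_table=None,
    pipeline_option={'num_workers': 2})
print 'Submitted DataFlow job: %s' % job_name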

solutionbox/inception/datalab_solutions/inception/_local.py

Lines changed: 14 additions & 10 deletions
@@ -42,7 +42,7 @@ def __init__(self, checkpoint=None):
     if self._checkpoint is None:
       self._checkpoint = _util._DEFAULT_CHECKPOINT_GSURL
 
-  def preprocess(self, dataset, output_dir):
+  def preprocess(self, train_dataset, eval_dataset, output_dir):
     """Local preprocessing with local DataFlow."""
 
     import datalab.mlalpha as mlalpha
@@ -53,12 +53,8 @@ def preprocess(self, dataset, output_dir):
     }
     opts = beam.pipeline.PipelineOptions(flags=[], **options)
     p = beam.Pipeline('DirectRunner', options=opts)
-    if type(dataset) is mlalpha.CsvDataSet:
-      _preprocess.configure_pipeline_csv(p, self._checkpoint, dataset.files, output_dir, job_id)
-    elif type(dataset) is mlalpha.BigQueryDataSet:
-      _preprocess.configure_pipeline_bigquery(p, self._checkpoint, dataset.sql, output_dir, job_id)
-    else:
-      raise ValueError('preprocess takes CsvDataSet or BigQueryDataset only.')
+    _preprocess.configure_pipeline(p, train_dataset, eval_dataset,
+                                   self._checkpoint, output_dir, job_id)
     p.run().wait_until_finish()
 
   def train(self, input_dir, batch_size, max_steps, output_dir):
@@ -77,7 +73,15 @@ def predict(self, model_dir, image_files):
     return _predictor.predict(model_dir, image_files)
 
 
-  def batch_predict(self, model_dir, input_csv, output_file, output_bq_table):
+  def batch_predict(self, dataset, model_dir, output_csv, output_bq_table):
     """Local batch prediction."""
-
-    return _predictor.batch_predict(model_dir, input_csv, output_file, output_bq_table)
+    import datalab.mlalpha as mlalpha
+    job_id = 'inception_batch_predict_' + datetime.datetime.now().strftime('%y%m%d_%H%M%S')
+    # Project is needed for bigquery data source, even in local run.
+    options = {
+        'project': _util.default_project(),
+    }
+    opts = beam.pipeline.PipelineOptions(flags=[], **options)
+    p = beam.Pipeline('DirectRunner', options=opts)
+    _predictor.configure_pipeline(p, dataset, model_dir, output_csv, output_bq_table)
+    p.run().wait_until_finish()
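
A short sketch of the reworked Local runner (file paths are hypothetical; a default project still needs to be configured because BigQuery sources query the service even under DirectRunner, as the comment in the diff notes):

import datalab.mlalpha as mlalpha
from datalab_solutions.inception import _local   # assumed import path for this package

train_set = mlalpha.CsvDataSet('./data/train.csv',
                               schema='image_url:STRING,label:STRING')
eval_set = mlalpha.CsvDataSet('./data/eval.csv',
                              schema='image_url:STRING,label:STRING')

runner = _local.Local()
# With an explicit eval_dataset there is no random 7:3 split of the training data.
runner.preprocess(train_set, eval_set, './preprocessed')

# ... train a model from the preprocessed output, then batch predict locally.
predict_set = mlalpha.CsvDataSet('./data/to_predict.csv', schema='image_url:STRING')
runner.batch_predict(predict_set, './trained_model', './results.csv', None)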

solutionbox/inception/datalab_solutions/inception/_package.py

Lines changed: 69 additions & 20 deletions
@@ -36,39 +36,49 @@
 from . import _util
 
 
-def local_preprocess(dataset, output_dir, checkpoint=None):
+def local_preprocess(train_dataset, output_dir, checkpoint=None, eval_dataset=None):
   """Preprocess data locally. Produce output that can be used by training efficiently.
   Args:
-    dataset: data source to preprocess. Can be either datalab.mlalpha.CsvDataset, or
-        datalab.mlalpha.BigQueryDataSet.
+    train_dataset: training data source to preprocess. Can be CsvDataset or BigQueryDataSet.
+        If eval_dataset is None, the pipeline will randomly split train_dataset into
+        train/eval set with 7:3 ratio.
     output_dir: The output directory to use. Preprocessing will create a sub directory under
         it for each run, and also update "latest" file which points to the latest preprocessed
         directory. Users are responsible for cleanup. Can be local or GCS path.
     checkpoint: the Inception checkpoint to use.
+    eval_dataset: evaluation data source to preprocess. Can be CsvDataset or BigQueryDataSet.
+        If specified, it will be used for evaluation during training, and train_dataset will be
+        completely used for training.
   """
 
   print 'Local preprocessing...'
   # TODO: Move this to a new process to avoid pickling issues
   # TODO: Expose train/eval split ratio
-  _local.Local(checkpoint).preprocess(dataset, output_dir)
+  _local.Local(checkpoint).preprocess(train_dataset, eval_dataset, output_dir)
   print 'Done'
 
 
-def cloud_preprocess(dataset, output_dir, checkpoint=None, pipeline_option=None):
+def cloud_preprocess(train_dataset, output_dir, checkpoint=None, pipeline_option=None,
+                     eval_dataset=None):
   """Preprocess data in Cloud with DataFlow.
   Produce output that can be used by training efficiently.
   Args:
-    dataset: data source to preprocess. Can be either datalab.mlalpha.CsvDataset, or
-        datalab.mlalpha.BigQueryDataSet. For CsvDataSet, all files need to be in GCS.
+    train_dataset: training data source to preprocess. Can be CsvDataset or BigQueryDataSet.
+        For CsvDataSet, all files must be in GCS.
+        If eval_dataset is None, the pipeline will randomly split train_dataset into
+        train/eval set with 7:3 ratio.
     output_dir: The output directory to use. Preprocessing will create a sub directory under
         it for each run, and also update "latest" file which points to the latest preprocessed
        directory. Users are responsible for cleanup. GCS path only.
     checkpoint: the Inception checkpoint to use.
+    pipeline_option: DataFlow pipeline options in a dictionary.
+    eval_dataset: evaluation data source to preprocess. Can be CsvDataset or BigQueryDataSet.
+        If specified, it will be used for evaluation during training, and train_dataset will be
+        completely used for training.
   """
 
-  # TODO: Move this to a new process to avoid pickling issues
-  # TODO: Expose train/eval split ratio
-  job_name = _cloud.Cloud(checkpoint=checkpoint).preprocess(dataset, output_dir, pipeline_option)
+  job_name = _cloud.Cloud(checkpoint=checkpoint).preprocess(train_dataset, eval_dataset,
+                                                            output_dir, pipeline_option)
   if (_util.is_in_IPython()):
     import IPython
 
@@ -172,19 +182,58 @@ def cloud_predict(model_id, image_files, show_image=True):
   _display_predict_results(results, show_image)
 
 
-def local_batch_predict(model_dir, input_csv, output_file, output_bq_table=None):
-  """Batch predict using an offline model.
+def local_batch_predict(dataset, model_dir, output_csv=None, output_bq_table=None):
+  """Batch predict running locally.
   Args:
+    dataset: CsvDataSet or BigQueryDataSet for batch prediction input. Can contain either
+        one column 'image_url', or two columns with another being 'label'.
     model_dir: The directory of a trained inception model. Can be local or GCS paths.
-    input_csv: The input csv which include two columns only: image_gs_url, label.
-        Can be local or GCS paths.
-    output_file: The output csv file containing prediction results.
-    output_bq_table: If provided, will also save the results to BigQuery table.
+    output_csv: The output csv file for prediction results. If specified,
+        it will also output a csv schema file with the name output_csv + '.schema.json'.
+    output_bq_table: if specified, the output BigQuery table for prediction results.
+        output_csv and output_bq_table can both be set.
+  Raises:
+    ValueError if both output_csv and output_bq_table are None.
   """
+
+  if output_csv is None and output_bq_table is None:
+    raise ValueError('output_csv and output_bq_table cannot both be None.')
+
   print('Predicting...')
-  _local.Local().batch_predict(model_dir, input_csv, output_file, output_bq_table)
+  _local.Local().batch_predict(dataset, model_dir, output_csv, output_bq_table)
   print('Done')
 
-def cloud_batch_predict(model_dir, image_files, show_image=True, output_file=None):
-  """Not Implemented Yet"""
-  pass
+
+def cloud_batch_predict(dataset, model_dir, gcs_staging_location,
+                        output_csv=None, output_bq_table=None, pipeline_option=None):
+  """Batch predict running in cloud.
+
+  Args:
+    dataset: CsvDataSet or BigQueryDataSet for batch prediction input. Can contain either
+        one column 'image_url', or two columns with another being 'label'.
+    model_dir: A GCS path to a trained inception model directory.
+    gcs_staging_location: A temporary location for DataFlow staging.
+    output_csv: If specified, prediction results will be saved to the specified Csv file.
+        It will also output a csv schema file with the name output_csv + '.schema.json'.
+        GCS file path only.
+    output_bq_table: If specified, prediction results will be saved to the specified BigQuery
+        table. output_csv and output_bq_table can both be set, but cannot be both None.
+    pipeline_option: DataFlow pipeline options in a dictionary.
+  Raises:
+    ValueError if both output_csv and output_bq_table are None.
+  """
+
+  if output_csv is None and output_bq_table is None:
+    raise ValueError('output_csv and output_bq_table cannot both be None.')
+
+  job_name = _cloud.Cloud().batch_predict(dataset, model_dir,
+                                          gcs_staging_location, output_csv,
                                          output_bq_table, pipeline_option)
+  if (_util.is_in_IPython()):
+    import IPython
+
+    dataflow_url = ('https://console.developers.google.com/dataflow?project=%s' %
+                    _util.default_project())
+    html = 'Job "%s" submitted.' % job_name
+    html += ('<p>Click <a href="%s" target="_blank">here</a> to track batch prediction job. <br/>'
+             % dataflow_url)
+    IPython.display.display_html(html, raw=True)
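
Putting the new public entry points in _package.py together, a notebook-level flow might look roughly like this (bucket, table, and directory names are hypothetical, training and other intermediate steps are omitted, and the import path is an assumption):

import datalab.mlalpha as mlalpha
from datalab_solutions.inception import _package as inception   # assumed import path

train_set = mlalpha.CsvDataSet('gs://my-bucket/train.csv',
                               schema='image_url:STRING,label:STRING')
eval_set = mlalpha.CsvDataSet('gs://my-bucket/eval.csv',
                              schema='image_url:STRING,label:STRING')

# Explicit eval data: the training set is used in full, no 7:3 random split.
inception.cloud_preprocess(train_set, 'gs://my-bucket/preprocessed',
                           eval_dataset=eval_set)

# ... train a model, then batch predict to both a CSV file and a BigQuery table.
predict_set = mlalpha.BigQueryDataSet(table='mydataset:images_to_predict')
inception.cloud_batch_predict(predict_set,
                              'gs://my-bucket/inception/model',
                              gcs_staging_location='gs://my-bucket/staging',
                              output_csv='gs://my-bucket/predict_results.csv',
                              output_bq_table='mydataset:image_predictions')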
