 from . import _util


-def local_preprocess(dataset, output_dir, checkpoint=None):
+def local_preprocess(train_dataset, output_dir, checkpoint=None, eval_dataset=None):
   """Preprocess data locally. Produce output that can be used by training efficiently.
   Args:
-    dataset: data source to preprocess. Can be either datalab.mlalpha.CsvDataset, or
-        datalab.mlalpha.BigQueryDataSet.
+    train_dataset: training data source to preprocess. Can be CsvDataSet or BigQueryDataSet.
+        If eval_dataset is None, the pipeline will randomly split train_dataset into
+        train/eval sets with a 7:3 ratio.
     output_dir: The output directory to use. Preprocessing will create a sub directory under
         it for each run, and also update "latest" file which points to the latest preprocessed
         directory. Users are responsible for cleanup. Can be local or GCS path.
     checkpoint: the Inception checkpoint to use.
+    eval_dataset: evaluation data source to preprocess. Can be CsvDataSet or BigQueryDataSet.
+        If specified, it will be used for evaluation during training, and train_dataset will be
+        used entirely for training.
   """

   print 'Local preprocessing...'
   # TODO: Move this to a new process to avoid pickling issues
   # TODO: Expose train/eval split ratio
-  _local.Local(checkpoint).preprocess(dataset, output_dir)
+  _local.Local(checkpoint).preprocess(train_dataset, eval_dataset, output_dir)
   print 'Done'

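For reference, a minimal usage sketch of the new `local_preprocess` signature; the package import, `CsvDataSet` constructor arguments, and file paths below are illustrative assumptions, not part of this change:

```python
# Hypothetical usage sketch; import paths, constructor arguments, and file
# locations are assumptions.
import datalab.mlalpha as mlalpha
from datalab_solutions.inception import _package as inception  # assumed package import

# CSV rows hold two columns: image_url, label (per the docstring above).
train_set = mlalpha.CsvDataSet('./data/train.csv')
eval_set = mlalpha.CsvDataSet('./data/eval.csv')

# With eval_dataset given, train_dataset is used entirely for training;
# omit eval_dataset to let the pipeline split train_dataset 7:3.
inception.local_preprocess(train_set, './preprocessed', eval_dataset=eval_set)
```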
-def cloud_preprocess(dataset, output_dir, checkpoint=None, pipeline_option=None):
+def cloud_preprocess(train_dataset, output_dir, checkpoint=None, pipeline_option=None,
+                     eval_dataset=None):
   """Preprocess data in Cloud with DataFlow.
   Produce output that can be used by training efficiently.
   Args:
-    dataset: data source to preprocess. Can be either datalab.mlalpha.CsvDataset, or
-        datalab.mlalpha.BigQueryDataSet. For CsvDataSet, all files need to be in GCS.
+    train_dataset: training data source to preprocess. Can be CsvDataSet or BigQueryDataSet.
+        For CsvDataSet, all files must be in GCS.
+        If eval_dataset is None, the pipeline will randomly split train_dataset into
+        train/eval sets with a 7:3 ratio.
     output_dir: The output directory to use. Preprocessing will create a sub directory under
         it for each run, and also update "latest" file which points to the latest preprocessed
         directory. Users are responsible for cleanup. GCS path only.
     checkpoint: the Inception checkpoint to use.
+    pipeline_option: DataFlow pipeline options in a dictionary.
+    eval_dataset: evaluation data source to preprocess. Can be CsvDataSet or BigQueryDataSet.
+        If specified, it will be used for evaluation during training, and train_dataset will be
+        used entirely for training.
   """

-  # TODO: Move this to a new process to avoid pickling issues
-  # TODO: Expose train/eval split ratio
-  job_name = _cloud.Cloud(checkpoint=checkpoint).preprocess(dataset, output_dir, pipeline_option)
+  job_name = _cloud.Cloud(checkpoint=checkpoint).preprocess(train_dataset, eval_dataset,
+                                                            output_dir, pipeline_option)
   if (_util.is_in_IPython()):
     import IPython

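A corresponding sketch for `cloud_preprocess`, again with assumed bucket names, imports, and pipeline option keys:

```python
# Hypothetical usage sketch; bucket names, import paths, and option keys are
# illustrative assumptions.
import datalab.mlalpha as mlalpha
from datalab_solutions.inception import _package as inception  # assumed package import

# For CsvDataSet input, all files must be in GCS.
train_set = mlalpha.CsvDataSet('gs://my-bucket/inception/train*.csv')

# pipeline_option is forwarded to the DataFlow pipeline as a dictionary.
inception.cloud_preprocess(train_set,
                           'gs://my-bucket/inception/preprocessed',
                           pipeline_option={'num_workers': 10})
```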
@@ -172,19 +182,58 @@ def cloud_predict(model_id, image_files, show_image=True):
   _display_predict_results(results, show_image)


-def local_batch_predict(model_dir, input_csv, output_file, output_bq_table=None):
-  """Batch predict using an offline model.
+def local_batch_predict(dataset, model_dir, output_csv=None, output_bq_table=None):
+  """Batch predict running locally.
   Args:
+    dataset: CsvDataSet or BigQueryDataSet for batch prediction input. Can contain either
+        one column 'image_url', or two columns with the other being 'label'.
     model_dir: The directory of a trained inception model. Can be local or GCS paths.
-    input_csv: The input csv which include two columns only: image_gs_url, label.
-        Can be local or GCS paths.
-    output_file: The output csv file containing prediction results.
-    output_bq_table: If provided, will also save the results to BigQuery table.
+    output_csv: The output csv file for prediction results. If specified,
+        it will also output a csv schema file with the name output_csv + '.schema.json'.
+    output_bq_table: If specified, the output BigQuery table for prediction results.
+        output_csv and output_bq_table can both be set.
+  Raises:
+    ValueError if both output_csv and output_bq_table are None.
   """
+
+  if output_csv is None and output_bq_table is None:
+    raise ValueError('output_csv and output_bq_table cannot both be None.')
+
   print('Predicting...')
-  _local.Local().batch_predict(model_dir, input_csv, output_file, output_bq_table)
+  _local.Local().batch_predict(dataset, model_dir, output_csv, output_bq_table)
   print('Done')

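A minimal sketch of the reworked `local_batch_predict`, with assumed paths, imports, and output names:

```python
# Hypothetical usage sketch; paths and import names are assumptions.
import datalab.mlalpha as mlalpha
from datalab_solutions.inception import _package as inception  # assumed package import

# Input needs an 'image_url' column; a 'label' column is optional.
batch_set = mlalpha.CsvDataSet('./data/to_predict.csv')

# At least one of output_csv / output_bq_table must be set, otherwise ValueError is raised.
# A schema file 'predictions.csv.schema.json' is written next to the CSV.
inception.local_batch_predict(batch_set,
                              './training/model',
                              output_csv='predictions.csv',
                              output_bq_table='mydataset.predictions')
```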
-def cloud_batch_predict(model_dir, image_files, show_image=True, output_file=None):
-  """Not Implemented Yet"""
-  pass
+
+def cloud_batch_predict(dataset, model_dir, gcs_staging_location,
+                        output_csv=None, output_bq_table=None, pipeline_option=None):
+  """Batch predict running in cloud.
+
+  Args:
+    dataset: CsvDataSet or BigQueryDataSet for batch prediction input. Can contain either
+        one column 'image_url', or two columns with the other being 'label'.
+    model_dir: A GCS path to a trained inception model directory.
+    gcs_staging_location: A temporary location for DataFlow staging.
+    output_csv: If specified, prediction results will be saved to the specified CSV file.
+        It will also output a csv schema file with the name output_csv + '.schema.json'.
+        GCS file path only.
+    output_bq_table: If specified, prediction results will be saved to the specified BigQuery
+        table. output_csv and output_bq_table can both be set, but cannot both be None.
+    pipeline_option: DataFlow pipeline options in a dictionary.
+  Raises:
+    ValueError if both output_csv and output_bq_table are None.
+  """
+
+  if output_csv is None and output_bq_table is None:
+    raise ValueError('output_csv and output_bq_table cannot both be None.')
+
+  job_name = _cloud.Cloud().batch_predict(dataset, model_dir,
+      gcs_staging_location, output_csv, output_bq_table, pipeline_option)
+  if (_util.is_in_IPython()):
+    import IPython
+
+    dataflow_url = ('https://console.developers.google.com/dataflow?project=%s' %
+                    _util.default_project())
+    html = 'Job "%s" submitted.' % job_name
+    html += ('<p>Click <a href="%s" target="_blank">here</a> to track the batch prediction job. <br/>'
+             % dataflow_url)
+    IPython.display.display_html(html, raw=True)
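And a sketch of the new `cloud_batch_predict`, with assumed GCS locations, table names, and imports; the `BigQueryDataSet` constructor argument is also an assumption:

```python
# Hypothetical usage sketch; bucket names, table names, and import paths are
# assumptions.
import datalab.mlalpha as mlalpha
from datalab_solutions.inception import _package as inception  # assumed package import

# A BigQuery source with an 'image_url' column (and optional 'label');
# the constructor argument shown here is an assumption.
batch_set = mlalpha.BigQueryDataSet('SELECT image_url, label FROM mydataset.images')

# model_dir and output_csv must be GCS paths; gcs_staging_location is a temporary
# DataFlow staging area.
inception.cloud_batch_predict(batch_set,
                              'gs://my-bucket/inception/model',
                              'gs://my-bucket/inception/staging',
                              output_csv='gs://my-bucket/inception/predictions.csv',
                              pipeline_option={'num_workers': 5})
```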