
Commit bfff3bc

sd training supports python 3 (#320)
* sw
* added e2e test
* raise logging level
* test only python 2. Python 3 support for SD is close I think, but I'll continue splitting this into smaller PRs.
* flake8
* add dataflow to test build, but not setup.py
* install dataflow in python 2.7 test only
* remove a print
* added local analyze test to python3
* sw
* sd training supports python 3
* flake8
* flake
* io.StringIO -> StringIO.StringIO for python 2
* use my wrapper
1 parent 7fe89c2 commit bfff3bc

7 files changed (+62 / -29 lines changed)

solutionbox/structured_data/mltoolbox/_structured_data/_package.py

Lines changed: 6 additions & 0 deletions
@@ -35,6 +35,7 @@
 import tempfile
 import json
 import glob
+import six
 import subprocess
 import pandas as pd
 from tensorflow.python.lib.io import file_io
@@ -430,8 +431,13 @@ def _get_abs_path(input_path):

     while p.poll() is None:
       line = p.stdout.readline()
+
+      if not six.PY2:
+        line = line.decode()
+
       if (line.startswith('INFO:tensorflow:global') or line.startswith('INFO:tensorflow:loss') or
           line.startswith('INFO:tensorflow:Saving dict')):
+
         sys.stdout.write(line)
   finally:
     if monitor_process:
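
Why the decode matters: on Python 3, pipes created by subprocess yield bytes, so the startswith checks against str literals would never match without it. A minimal sketch of the pattern, assuming a toy command in place of the real training subprocess:

    import subprocess
    import sys

    import six

    p = subprocess.Popen(
        [sys.executable, '-c', "print('INFO:tensorflow:loss = 0.5')"],
        stdout=subprocess.PIPE)
    for raw in iter(p.stdout.readline, b''):  # b'' == '' on Python 2
      line = raw if six.PY2 else raw.decode()  # pipes yield bytes on Python 3
      if line.startswith('INFO:tensorflow:'):
        sys.stdout.write(line)
    p.wait()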

solutionbox/structured_data/mltoolbox/_structured_data/preprocess/cloud_preprocess.py

Lines changed: 2 additions & 2 deletions
@@ -17,9 +17,9 @@
 from __future__ import print_function

 import argparse
-import io
 import json
 import os
+import six
 import sys


@@ -212,7 +212,7 @@ def run_categorical_analysis(table, schema_list, args):
   df = query.execute().result().to_dataframe()

   # Write the results to a file.
-  string_buff = io.StringIO()
+  string_buff = six.StringIO()
   df.to_csv(string_buff, index=False, header=False)
   file_io.write_string_to_file(out_file, string_buff.getvalue())
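
The swap to six.StringIO is the Python 2 accommodation: io.StringIO accepts only unicode, while pandas' to_csv writes byte strings on Python 2. six.StringIO resolves to StringIO.StringIO on Python 2 and io.StringIO on Python 3, so one spelling works on both. A small sketch, assuming a toy DataFrame:

    import pandas as pd
    import six

    df = pd.DataFrame({'value': [1, 2]})
    string_buff = six.StringIO()  # StringIO.StringIO on PY2, io.StringIO on PY3
    df.to_csv(string_buff, index=False, header=False)
    print(string_buff.getvalue())  # '1\n2\n'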

solutionbox/structured_data/mltoolbox/_structured_data/trainer/__init__.py

Lines changed: 3 additions & 1 deletion
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-import task
+from __future__ import absolute_import
+
+from . import task

 __all__ = ['task']
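
The old "import task" is an implicit relative import: Python 2 finds task.py in the package directory, but Python 3 treats every bare import as absolute and raises ImportError. Adding the __future__ import gives Python 2 the same semantics, so the explicit relative form is the one spelling that works on both. Illustrated against this package's own layout:

    # trainer/__init__.py
    from __future__ import absolute_import

    from . import task   # explicit relative import: valid on Python 2 and 3
    # import task        # implicit relative import: ImportError on Python 3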

solutionbox/structured_data/mltoolbox/_structured_data/trainer/util.py

Lines changed: 40 additions & 18 deletions
@@ -17,6 +17,7 @@
 import multiprocessing
 import os
 import math
+import six

 import tensorflow as tf
 from tensorflow.python.lib.io import file_io
@@ -65,21 +66,16 @@ class NotFittedError(ValueError):
 # ==============================================================================


-def _copy_all(src_files, dest_dir):
-  # file_io.copy does not copy files into folders directly.
-  for src_file in src_files:
-    file_name = os.path.basename(src_file)
-    new_file_location = os.path.join(dest_dir, file_name)
-    file_io.copy(src_file, new_file_location, overwrite=True)
-
-
 def _recursive_copy(src_dir, dest_dir):
   """Copy the contents of src_dir into the folder dest_dir.
   Args:
     src_dir: gsc or local path.
     dest_dir: gcs or local path.
   When called, dest_dir should exist.
   """
+  src_dir = python_portable_string(src_dir)
+  dest_dir = python_portable_string(dest_dir)
+
   file_io.recursive_create_dir(dest_dir)
   for file_name in file_io.list_directory(src_dir):
     old_path = os.path.join(src_dir, file_name)
@@ -252,7 +248,9 @@ def export_fn(estimator, export_dir_base, checkpoint_path=None, eval_result=None
       gfile.Copy(source, dest_absolute)

     # only keep the last 3 models
-    saved_model_export_utils.garbage_collect_exports(export_dir_base, exports_to_keep=3)
+    saved_model_export_utils.garbage_collect_exports(
+        python_portable_string(export_dir_base),
+        exports_to_keep=3)

     # save the last model to the model folder.
     # export_dir_base = A/B/intermediate_models/
@@ -482,7 +480,8 @@ def preprocess_input(features, target, train_config, preprocess_output_dir,
                      (NUMERICAL_ANALYSIS, preprocess_output_dir))

   numerical_anlysis = json.loads(
-      file_io.read_file_to_string(numerical_analysis_file))
+      python_portable_string(
+          file_io.read_file_to_string(numerical_analysis_file)))

   for name in train_config['numerical_columns']:
     if name == target_name or name == key_name:
@@ -671,7 +670,8 @@ def get_vocabulary(preprocess_output_dir, name):
     raise ValueError('File %s not found in %s' %
                      (CATEGORICAL_ANALYSIS % name, preprocess_output_dir))

-  labels = file_io.read_file_to_string(vocab_file).split('\n')
+  labels = python_portable_string(
+      file_io.read_file_to_string(vocab_file)).split('\n')
   label_values = [x for x in labels if x]  # remove empty lines

   return label_values
@@ -709,10 +709,13 @@ def merge_metadata(preprocess_output_dir, transforms_file):
                                        NUMERICAL_ANALYSIS)
   schema_file = os.path.join(preprocess_output_dir, SCHEMA_FILE)

-  numerical_anlysis = json.loads(file_io.read_file_to_string(
-      numerical_anlysis_file))
-  schema = json.loads(file_io.read_file_to_string(schema_file))
-  transforms = json.loads(file_io.read_file_to_string(transforms_file))
+  numerical_anlysis = json.loads(
+      python_portable_string(
+          file_io.read_file_to_string(numerical_anlysis_file)))
+  schema = json.loads(
+      python_portable_string(file_io.read_file_to_string(schema_file)))
+  transforms = json.loads(
+      python_portable_string(file_io.read_file_to_string(transforms_file)))

   result_dict = {}
   result_dict['csv_header'] = [col_schema['name'] for col_schema in schema]
@@ -725,7 +728,7 @@ def merge_metadata(preprocess_output_dir, transforms_file):
   result_dict['vocab_stats'] = {}

   # get key column.
-  for name, trans_config in transforms.iteritems():
+  for name, trans_config in six.iteritems(transforms):
     if trans_config.get('transform', None) == 'key':
       result_dict['key_column'] = name
       break
@@ -734,7 +737,7 @@ def merge_metadata(preprocess_output_dir, transforms_file):

   # get target column.
   result_dict['target_column'] = schema[0]['name']
-  for name, trans_config in transforms.iteritems():
+  for name, trans_config in six.iteritems(transforms):
     if trans_config.get('transform', None) == 'target':
       result_dict['target_column'] = name
       break
@@ -756,7 +759,7 @@ def merge_metadata(preprocess_output_dir, transforms_file):
       raise ValueError('Unsupported schema type %s' % col_type)

   # Get the transforms.
-  for name, trans_config in transforms.iteritems():
+  for name, trans_config in six.iteritems(transforms):
     if name != result_dict['target_column'] and name != result_dict['key_column']:
       result_dict['transforms'][name] = trans_config

@@ -849,3 +852,22 @@ def is_regression_model(model_type):

 def is_classification_model(model_type):
   return model_type.endswith('_classification')
+
+
+# Note that this function exists in google.datalab.utils, but that is not
+# installed on the training workers.
+def python_portable_string(string, encoding='utf-8'):
+  """Converts bytes into a string type.
+
+  Valid string types are retuned without modification. So in Python 2, type str
+  and unicode are not converted.
+
+  In Python 3, type bytes is converted to type str (unicode)
+  """
+  if isinstance(string, six.string_types):
+    return string
+
+  if six.PY3:
+    return string.decode(encoding)
+
+  raise ValueError('Unsupported type %s' % str(type(string)))
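
Two portability patterns run through this file. python_portable_string handles readers that may return bytes under Python 3 (file_io.read_file_to_string among them), since json.loads rejects bytes on Python 3 versions before 3.6; six.iteritems replaces dict.iteritems(), which Python 3 removed. A usage sketch, with the helper repeated for self-containment and a toy JSON payload in place of real preprocessing output:

    import json

    import six

    def python_portable_string(string, encoding='utf-8'):
      # Pass str/unicode through unchanged; decode bytes on Python 3.
      if isinstance(string, six.string_types):
        return string
      if six.PY3:
        return string.decode(encoding)
      raise ValueError('Unsupported type %s' % str(type(string)))

    # Bytes from a file reader parse identically on Python 2 and 3:
    schema = json.loads(python_portable_string(b'[{"name": "id"}]'))

    transforms = {'id': {'transform': 'key'}}
    for name, trans_config in six.iteritems(transforms):  # iteritems() is PY2-only
      print(name, trans_config)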

solutionbox/structured_data/test_mltoolbox/e2e_functions.py

Lines changed: 5 additions & 0 deletions
@@ -17,6 +17,7 @@
 import os
 import random
 import json
+import six
 import subprocess


@@ -202,6 +203,10 @@ def run_training(
   logger.debug('Going to run command: %s' % ' '.join(cmd))
   sp = subprocess.Popen(' '.join(cmd), shell=True, stderr=subprocess.PIPE)
   _, err = sp.communicate()
+
+  if not six.PY2:
+    err = err.decode()
+
   return err

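
communicate() has the same bytes behavior as readline(): on Python 3 the captured stderr comes back as bytes and must be decoded before callers can do string work on it. (universal_newlines=True would push the decoding into subprocess itself, but the commit keeps the explicit decode.) A minimal sketch with a stand-in command:

    import subprocess
    import sys

    import six

    sp = subprocess.Popen(
        [sys.executable, '-c', "import sys; sys.stderr.write('warning\\n')"],
        stderr=subprocess.PIPE)
    _, err = sp.communicate()
    if not six.PY2:
      err = err.decode()  # communicate() returns bytes on Python 3
    assert isinstance(err, str)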

solutionbox/structured_data/test_mltoolbox/test_datalab_e2e.py

Lines changed: 3 additions & 1 deletion
@@ -178,8 +178,10 @@ def test_e2e(self):
     try:
       self._make_test_files()
       self._run_analyze()
+      self._run_train()
       if six.PY2:
-        self._run_train()
+        # Dataflow is only supported by python 2. Prediction assumes Dataflow
+        # is installed.
         self._run_predict()
         self._run_batch_prediction(
             os.path.join(self._batch_predict_output, 'with_target'),

tests/main.py

Lines changed: 3 additions & 7 deletions
@@ -38,9 +38,9 @@
 import kernel.html_tests
 import kernel.storage_tests
 import kernel.utils_tests
-import mltoolbox_structured_data.traininglib_tests
-import mltoolbox_structured_data.sd_e2e_tests
 import mltoolbox_structured_data.dl_interface_tests
+import mltoolbox_structured_data.sd_e2e_tests
+import mltoolbox_structured_data.traininglib_tests
 import stackdriver.commands.monitoring_tests
 import stackdriver.monitoring.group_tests
 import stackdriver.monitoring.metric_tests
@@ -78,6 +78,7 @@
     kernel.utils_tests,
     mltoolbox_structured_data.dl_interface_tests,
     mltoolbox_structured_data.sd_e2e_tests,  # Not everything runs in Python 3.
+    mltoolbox_structured_data.traininglib_tests,
     stackdriver.commands.monitoring_tests,
     stackdriver.monitoring.group_tests,
     stackdriver.monitoring.metric_tests,
@@ -93,11 +94,6 @@
     _util.util_tests
 ]

-# mltoolbox is not part of the datalab install, but it should still be tested.
-# mltoolbox does not work with python 3.
-if sys.version_info.major == 2:
-  _TEST_MODULES.append(mltoolbox_structured_data.traininglib_tests)
-
 if __name__ == '__main__':
   suite = unittest.TestSuite()
   for m in _TEST_MODULES:
