microsoft · ganik · Feb 25, 2020 · Feb 24, 2020
diff --git a/src/python/nimbusml.pyproj b/src/python/nimbusml.pyproj
@@ -754,6 +754,8 @@
     <Compile Include="nimbusml\timeseries\ssaspikedetector.py" />
     <Compile Include="nimbusml\timeseries\timeseriesimputer.py" />
     <Compile Include="nimbusml\timeseries\__init__.py" />
+    <Compile Include="tests_extended\data_frame_tool.py" />
+    <Compile Include="tests_extended\vinod.py" />
     <Compile Include="tests_extended\test_export_to_onnx.py" />
     <Compile Include="tests\test_estimator_checks.py" />
     <Compile Include="nimbusml\tests\feature_extraction\text\test_lightlda.py" />

diff --git a/src/python/tests_extended/data_frame_tool.py b/src/python/tests_extended/data_frame_tool.py
@@ -0,0 +1,192 @@
+#-------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+#--------------------------------------------------------------------------
+
+from datetime import datetime
+import numpy as np
+import pandas as pd
+import onnxruntime as onnxrt
+
+ort_float_set = set([np.float32, np.float64])
+
+pd_float_set = set(['float64'])
+
+ort_int_set = set([np.int8, np.uint8, np.int16, np.uint16, np.int32, np.uint32, np.int64, np.uint64])
+
+pd_int_set = set(['int64'])
+
+types_dict = {
+    'tensor(float16)': np.float16,
+    'tensor(float)'  : np.float32,
+    'tensor(double)' : np.float64,
+
+    'tensor(int8)'   : np.int8,
+    'tensor(uint8)'  : np.uint8,
+    'tensor(int16)'  : np.int16,
+    'tensor(uint16)' : np.uint16,
+    'tensor(int32)'  : np.int32,
+    'tensor(uint32)' : np.uint32,
+    'tensor(int64)'  : np.int64,
+    'tensor(uint64)' : np.uint64,
+
+    'tensor(bool)'   : np.bool,
+    'tensor(string)' : np.object
+}
+
+class DataFrameTool():
+    """
+    This is a utility class used to run a model with pandas.DataFrame input
+    """
+    def __init__(self, model_path, sess_options=None):
+        """
+        :param model_path: path to the model to be loaded
+        :param sess_options: see onnxruntime.SessionsOptions
+        """
+        self._model_path = model_path
+        self._sess_options = sess_options
+        self._sess = onnxrt.InferenceSession(self._model_path, self._sess_options)
+
+    def _reshape_input(self, input_array, expected_shape):
+        """
+        :param - input_array numpy array. This one is obtained from DataFrame and expected to have
+        :      a rank if 1.
+        :expected_shape - shape fetched from the model which may include dynamic elements.
+        :  expected_shape may at most have one -1, None or zero which will be computed from
+        :  the size of the input_array. We replace None and zeros to -1 and let np.ndarray.reshape deal with it.
+        """
+        # expected_shape rank is one, we will let onnxruntime to deal with it
+        if len(expected_shape) == 1:
+            return input_array
+
+        inferred_shape = [dim if dim else -1 for dim in expected_shape]
+        return input_array.reshape(inferred_shape)
+
+    def _validate_type(self, input_meta, col_type):
+        """
+        : input_meta - meta info obtained from the model for the given input
+        : col_type - dtype of the column
+        : throws if conditions are not met
+
+         float16 and bool will always require exact match
+         We attempt to convert any type to a string if it is required.
+         With strings we always want to put this into a flat array, cast to np.object and then reshape as object
+         Any other type to qualify for casting must match either integer or floating point types
+         Python datetime which is denoted in Pandas as datetime64[ns] are cast to int64
+        """
+        expected_type = types_dict[input_meta.type]
+        if input_meta.type == 'tensor(string)':
+           return
+        elif expected_type == col_type:
+           return
+        elif expected_type == np.int64 and str(col_type) == 'datetime64[ns]':
+           return
+        elif expected_type in ort_float_set and str(col_type) in pd_float_set:
+           return
+        elif expected_type in ort_int_set and str(col_type) in pd_int_set:
+           return
+
+        raise TypeError("Input {} requires type {} unable to cast column type {} ".format(
+            input_meta.name, expected_type, col_type))
+
+
+    def _process_input_list(self, df, input_metas, require):
+        """
+        Return a dictionary of input_name : a typed and shaped np.array of values for a given input_meta
+        The function does the heavy lifting for _get_input_feeds()
+
+        :param df: See :class:`pandas.DataFrame`. 
+        :param input_metas: a list of name/type pairs
+        :require is a boolean. If True this helper throws on a missing input.
+
+        """
+        feeds = {}
+        # Process mandadory inputs. Raise an error if anything is not present
+        for input_meta in input_metas:
+            # We fully expect all the types are in the above dictionary
+            assert input_meta.type in types_dict, "Update types_dict for the new type"
+            if input_meta.name in df.columns:
+                self._validate_type(input_meta, df[input_meta.name].dtype)
+                if (df[input_meta.name].dtype) == 'datetime64[ns]':
+                    input_array = np.array([dt.timestamp() for dt in df[input_meta.name]]).astype(np.int64)
+                else:
+                    # With strings we must cast first to np.object then then reshape
+                    # so we do it for everything
+                    input_array = np.array(df[input_meta.name]).astype(types_dict[input_meta.type])
+
+                feeds[input_meta.name] = self._reshape_input(input_array, input_meta.shape)
+
+            elif require:
+                raise RuntimeError("This model requires input {} of type {} but it is not found in the DataFrame".format(
+                               input_meta.name, types_dict[input_meta.type]))
+        return feeds
+
+
+    def _get_input_feeds(self, df, sess):
+        """
+        Return a dictionary of input_name : a typed and shaped np.array of values
+        This function accepts Pandas DataFrame as the first argument and onnxruntime
+        session with a loaded model. The function interrogates the model for the inputs
+        and matches the model input names to the DataFrame instance column names.
+        It requires exact matches for bool and float16 types. It attempts to convert to
+        string any input type if string is required.
+        It attempts to convert floating types to each other and does the same for all of the
+        integer types without requiring an exact match.
+
+        :param df: See :class:`pandas.DataFrame`. The function only considers the first row (0) of each column
+            and feeds the data to the appropriate model inputs.
+
+        :param sess: See :class:`onnxruntime.InferenceSession`.
+
+        ::
+        For example: pd.DataFrame([[0], [4],[20]],index=[0], columns=['A', 'B', 'C'])
+
+        """
+        if df.empty:
+            raise RuntimeError('input DataFrame is empty')
+
+        # Process mandadory inputs. Raise an error if anything is not present
+        feeds = self._process_input_list(df, sess.get_inputs(), True)
+        # Process optional overridable initializers. If present the initialzier value
+        # is overriden by the input. If not, the initialzier value embedded in the model takes effect.
+        initializers = self._process_input_list(df, sess.get_overridable_initializers(), False)
+
+        feeds.update(initializers)
+
+        return feeds
+
+    def execute(self, df, output_names=None, output_types=None, run_options=None):
+        "Return a list of output values restricted to output names if not empty"
+        """
+        Compute the predictions.
+
+        :param df: See :class:`pandas.DataFrame`.
+        :output_name - list of column output names and their order to output
+        :output_types { output_name : dtype } optional dictionary that asks to cast output
+           to the colum type
+
+        :param run_options: See :class:`onnxruntime.RunOptions`.
+        ::
+        sess.run([output_name], {input_name: x})
+        Pandas DataFrame
+        """
+        input_feed = self._get_input_feeds(df, self._sess);
+        if not output_names:
+          output_names = [output.name for output in self._sess._outputs_meta]
+
+        results = self._sess.run(output_names, input_feed, run_options)
+
+        df = pd.DataFrame()
+        for i in range(len(results)):
+           r = results[i].flatten()
+           if output_types and output_names[i] in output_types:
+             dtype = output_types[output_names[i]]
+             if dtype == np.dtype('datetime64'):
+                r = r.astype(np.int64)
+                r = [datetime.utcfromtimestamp(ts) for ts in r]
+             else:
+                r = r.astype(dtype)
+
+           df[output_names[i]] = r
+
+        return df
diff --git a/src/python/tests_extended/test_export_to_onnx.py b/src/python/tests_extended/test_export_to_onnx.py
@@ -43,6 +43,7 @@
 from nimbusml.timeseries import (IidSpikeDetector, IidChangePointDetector,
                                  SsaSpikeDetector, SsaChangePointDetector,
                                  SsaForecaster)
+from data_frame_tool import DataFrameTool as DFT
 
 
 SHOW_ONNX_JSON = False
@@ -559,6 +560,8 @@ def test_export_to_onnx(estimator, class_name):
         try:
             onnxrunner = OnnxRunner(model_file=onnx_path)
             result_onnx = onnxrunner.fit_transform(dataset)
+            df_tool = DFT(onnx_path)
+            result_onnx1 = df_tool.execute(dataset, [])
 
             if SHOW_TRANSFORMED_RESULTS:
                 print_results(result_expected, result_onnx)
@@ -590,8 +593,8 @@ def test_export_to_onnx(estimator, class_name):
 for entry_point in entry_points:
     class_name = entry_point['NewName']
 
-#    if not class_name in ['Handler']:
-#        continue
+    if not class_name in ['Handler']:
+        continue
 
     print('\n===========> %s' % class_name)
 

diff --git a/src/python/tests_extended/vinod.py b/src/python/tests_extended/vinod.py
@@ -0,0 +1,63 @@
+import os
+import tempfile
+import nimbusml.linear_model as nml_linear
+from nimbusml.feature_extraction.categorical import OneHotVectorizer
+from nimbusml.preprocessing.missing_values import Handler
+from nimbusml import FileDataStream
+from nimbusml.preprocessing import DatasetTransformer
+from nimbusml import Pipeline
+from nimbusml.preprocessing import OnnxRunner
+
+def get_tmp_file(suffix=None):
+    fd, file_name = tempfile.mkstemp(suffix=suffix)
+    fl = os.fdopen(fd, 'w')
+    fl.close()
+    return file_name
+
+X_train_dprep = FileDataStream.read_csv("E:/sources/vinod/NYCTaxiTipPrediction_train.csv")
+X_test_dprep = FileDataStream.read_csv("E:/sources/vinod/NYCTaxiTipPrediction_valid.csv")
+
+try:
+    pipe_featurization = Pipeline([OneHotVectorizer(columns={'vendor_id': 'vendor_id', 'payment_type': 'payment_type', 'passenger_count': 'passenger_count','rate_code': 'rate_code'})
+                                      ,Handler(columns={'trip_distance': 'trip_distance', 'trip_time_in_secs': 'trip_time_in_secs'})
+    ])
+    pipe_featurization.fit(X_train_dprep)
+
+    pipe_training = Pipeline([DatasetTransformer(pipe_featurization.model),
+                nml_linear.FastLinearRegressor(feature=['vendor_id', 'payment_type', 'passenger_count', 'rate_code', 'trip_distance', 'trip_time_in_secs'],label='fare_amount')
+    ])
+    pipe_training.fit(X_train_dprep)
+
+    metrics, scores = pipe_training.test(X_test_dprep)
+    print(metrics)
+    print('training done')
+
+    # Export the pipeline to ONNX
+    onnx_path = get_tmp_file('.onnx')
+    pipe_training.export_to_onnx(onnx_path, 'com.microsoft.ml', onnx_version='Stable')
+    print('export done')
+
+    # Perform the transform using the standard ML.Net backend
+    result_standard = pipe_training.transform(X_test_dprep)
+    print(result_standard)
+    print('done transform using standard backend')
+    #          c1        c2
+    # 0  0.025025  0.000998
+    # 1  0.305305  0.000998
+
+    # Perform the transform using the ONNX backend.
+    # Note, the extra columns and column name differences
+    # is a known issue with the ML.Net backend.
+    onnxrunner = OnnxRunner(model_file=onnx_path)
+    result_onnx = onnxrunner.fit_transform(X_test_dprep)
+    print('done transform using onnx backend')
+    print(result_onnx)
+    #      c1   c2     c12.0     c22.0
+    # 0   2.5  1.0  0.025025  0.000998
+    # 1  30.5  1.0  0.305305  0.000998
+
+except Exception as e:
+    print('tragedy')
+    print(e)
+
+print ("done")