Skip to content
This repository was archived by the owner on Nov 16, 2023. It is now read-only.

Add ORT & vinod script #449

Merged
merged 1 commit into from
Feb 25, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions src/python/nimbusml.pyproj
Original file line number Diff line number Diff line change
Expand Up @@ -754,6 +754,8 @@
<Compile Include="nimbusml\timeseries\ssaspikedetector.py" />
<Compile Include="nimbusml\timeseries\timeseriesimputer.py" />
<Compile Include="nimbusml\timeseries\__init__.py" />
<Compile Include="tests_extended\data_frame_tool.py" />
<Compile Include="tests_extended\vinod.py" />
<Compile Include="tests_extended\test_export_to_onnx.py" />
<Compile Include="tests\test_estimator_checks.py" />
<Compile Include="nimbusml\tests\feature_extraction\text\test_lightlda.py" />
Expand Down
192 changes: 192 additions & 0 deletions src/python/tests_extended/data_frame_tool.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,192 @@
#-------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
#--------------------------------------------------------------------------

from datetime import datetime
import numpy as np
import pandas as pd
import onnxruntime as onnxrt

# numpy floating point types that a pandas floating column may be cast to.
# float16 is deliberately absent: it is only accepted on an exact dtype match.
ort_float_set = {np.float32, np.float64}

# pandas column dtype names (as produced by str(dtype)) treated as floating point.
pd_float_set = {'float64'}

# numpy integer types that a pandas integer column may be cast to.
ort_int_set = {np.int8, np.uint8, np.int16, np.uint16,
               np.int32, np.uint32, np.int64, np.uint64}

# pandas column dtype names (as produced by str(dtype)) treated as integer.
pd_int_set = {'int64'}

# Maps the type string reported for a model input to the numpy type used
# when building the feed array for that input.
types_dict = {
    'tensor(float16)': np.float16,
    'tensor(float)': np.float32,
    'tensor(double)': np.float64,

    'tensor(int8)': np.int8,
    'tensor(uint8)': np.uint8,
    'tensor(int16)': np.int16,
    'tensor(uint16)': np.uint16,
    'tensor(int32)': np.int32,
    'tensor(uint32)': np.uint32,
    'tensor(int64)': np.int64,
    'tensor(uint64)': np.uint64,

    # np.bool / np.object were deprecated aliases of the Python builtins and
    # are not reliably available in current NumPy; use the real scalar types.
    'tensor(bool)': np.bool_,
    'tensor(string)': np.object_,
}

class DataFrameTool():
    """
    Utility class used to run an ONNX model with pandas.DataFrame input
    and to return the model outputs as a pandas.DataFrame.
    """
    def __init__(self, model_path, sess_options=None):
        """
        Load the model into a new onnxruntime inference session.

        :param model_path: path to the model to be loaded
        :param sess_options: see onnxruntime.SessionOptions
        """
        self._model_path = model_path
        self._sess_options = sess_options
        self._sess = onnxrt.InferenceSession(self._model_path, self._sess_options)

    def _reshape_input(self, input_array, expected_shape):
        """
        Reshape a column array to the shape declared by the model input.

        :param input_array: numpy array obtained from a DataFrame column.
        :param expected_shape: shape fetched from the model, which may include
            dynamic elements. At most one element may be dynamic; it is
            replaced by -1 so that numpy.ndarray.reshape computes it from the
            size of input_array.
        :return: input_array itself when expected_shape has rank 1,
            otherwise the reshaped array.
        """
        # expected_shape rank is one, we will let onnxruntime deal with it
        if len(expected_shape) == 1:
            return input_array

        # Anything that is not a positive integer (None, 0, -1 or a symbolic
        # dimension name given as a string) is a dynamic dimension. Passing a
        # string through to reshape() would raise, so map all of them to -1.
        inferred_shape = [
            dim if isinstance(dim, (int, np.integer)) and dim > 0 else -1
            for dim in expected_shape
        ]
        return input_array.reshape(inferred_shape)

    def _validate_type(self, input_meta, col_type):
        """
        Check that a column of dtype col_type can feed the given model input.

        :param input_meta: meta info obtained from the model for the input
        :param col_type: dtype of the DataFrame column
        :raises TypeError: if none of the rules below accepts the column

        Rules, in order:
        - string inputs accept any column dtype (no check is made);
        - an exact dtype match is always accepted (this is the only way
          float16 and bool inputs are accepted);
        - int64 inputs accept 'datetime64[ns]' columns;
        - float32/float64 inputs accept columns listed in pd_float_set;
        - integer inputs accept columns listed in pd_int_set.
        """
        expected_type = types_dict[input_meta.type]
        col_type_name = str(col_type)

        if input_meta.type == 'tensor(string)':
            return
        if expected_type == col_type:
            return
        if expected_type == np.int64 and col_type_name == 'datetime64[ns]':
            return
        if expected_type in ort_float_set and col_type_name in pd_float_set:
            return
        if expected_type in ort_int_set and col_type_name in pd_int_set:
            return

        raise TypeError("Input {} requires type {} unable to cast column type {} ".format(
            input_meta.name, expected_type, col_type))

    def _process_input_list(self, df, input_metas, require):
        """
        Return a dictionary of input_name : typed and shaped np.array of
        values for each entry of input_metas that has a matching column.
        The function does the heavy lifting for _get_input_feeds()

        :param df: See :class:`pandas.DataFrame`.
        :param input_metas: model input descriptions exposing name, type
            and shape attributes
        :param require: boolean. If True, a missing column raises; if False,
            inputs without a matching column are silently skipped.
        :raises RuntimeError: if require is True and a column is missing
        :raises TypeError: if a column dtype cannot feed the input
        """
        feeds = {}
        for input_meta in input_metas:
            # We fully expect all the types are in the above dictionary
            assert input_meta.type in types_dict, "Update types_dict for the new type"

            if input_meta.name not in df.columns:
                if require:
                    raise RuntimeError("This model requires input {} of type {} but it is not found in the DataFrame".format(
                        input_meta.name, types_dict[input_meta.type]))
                continue

            column = df[input_meta.name]
            self._validate_type(input_meta, column.dtype)
            if column.dtype == 'datetime64[ns]':
                # Datetime columns are fed as whole-second timestamps (int64)
                input_array = np.array([dt.timestamp() for dt in column]).astype(np.int64)
            else:
                # With strings we must cast first to the object dtype and
                # then reshape, so the same path is used for everything
                input_array = np.array(column).astype(types_dict[input_meta.type])

            feeds[input_meta.name] = self._reshape_input(input_array, input_meta.shape)
        return feeds

    def _get_input_feeds(self, df, sess):
        """
        Return a dictionary of input_name : a typed and shaped np.array of values.

        The function interrogates the session for the model inputs and
        matches the model input names to the DataFrame column names. Every
        model input must have a matching column; overridable initializers are
        fed only when a column with their name is present, otherwise the
        value embedded in the model takes effect.

        Type handling (see _validate_type): bool and float16 require an exact
        dtype match, string inputs accept any column, float64 columns may
        feed float32/float64 inputs and int64 columns may feed any integer
        input.

        :param df: See :class:`pandas.DataFrame`. All rows of each matching
            column are fed to the corresponding model input.
        :param sess: See :class:`onnxruntime.InferenceSession`.
        :raises RuntimeError: if df is empty or a required input is missing

        ::
            For example: pd.DataFrame([[0, 4, 20]], index=[0], columns=['A', 'B', 'C'])

        """
        if df.empty:
            raise RuntimeError('input DataFrame is empty')

        # Process mandatory inputs. Raise an error if anything is not present
        feeds = self._process_input_list(df, sess.get_inputs(), True)
        # Process optional overridable initializers. If present the initializer value
        # is overridden by the input. If not, the value embedded in the model takes effect.
        initializers = self._process_input_list(df, sess.get_overridable_initializers(), False)

        feeds.update(initializers)

        return feeds

    def execute(self, df, output_names=None, output_types=None, run_options=None):
        """
        Compute the predictions and return them as a pandas.DataFrame.

        :param df: See :class:`pandas.DataFrame`. Input data; column names
            are matched to model input names.
        :param output_names: list of model output names, in the order they
            should appear as columns. When empty or None, all model outputs
            are returned.
        :param output_types: optional dictionary { output_name : dtype } that
            asks to cast the output to that column type. A datetime64 dtype
            converts the values from integer timestamps to UTC datetimes.
        :param run_options: See :class:`onnxruntime.RunOptions`.
        :return: a new DataFrame with one column per output; each output is
            flattened to one dimension before being stored.
        """
        input_feed = self._get_input_feeds(df, self._sess)
        if not output_names:
            # Use the public accessor rather than the session's private field
            output_names = [output.name for output in self._sess.get_outputs()]

        results = self._sess.run(output_names, input_feed, run_options)

        result_df = pd.DataFrame()
        for name, result in zip(output_names, results):
            values = result.flatten()
            if output_types and name in output_types:
                dtype = output_types[name]
                if dtype == np.dtype('datetime64'):
                    # Mirror the input path: timestamps are whole seconds
                    values = values.astype(np.int64)
                    values = [datetime.utcfromtimestamp(ts) for ts in values]
                else:
                    values = values.astype(dtype)

            result_df[name] = values

        return result_df
7 changes: 5 additions & 2 deletions src/python/tests_extended/test_export_to_onnx.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
from nimbusml.timeseries import (IidSpikeDetector, IidChangePointDetector,
SsaSpikeDetector, SsaChangePointDetector,
SsaForecaster)
from data_frame_tool import DataFrameTool as DFT


SHOW_ONNX_JSON = False
Expand Down Expand Up @@ -559,6 +560,8 @@ def test_export_to_onnx(estimator, class_name):
try:
onnxrunner = OnnxRunner(model_file=onnx_path)
result_onnx = onnxrunner.fit_transform(dataset)
df_tool = DFT(onnx_path)
result_onnx1 = df_tool.execute(dataset, [])

if SHOW_TRANSFORMED_RESULTS:
print_results(result_expected, result_onnx)
Expand Down Expand Up @@ -590,8 +593,8 @@ def test_export_to_onnx(estimator, class_name):
for entry_point in entry_points:
class_name = entry_point['NewName']

# if not class_name in ['Handler']:
# continue
if not class_name in ['Handler']:
continue

print('\n===========> %s' % class_name)

Expand Down
63 changes: 63 additions & 0 deletions src/python/tests_extended/vinod.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
import os
import tempfile
import nimbusml.linear_model as nml_linear
from nimbusml.feature_extraction.categorical import OneHotVectorizer
from nimbusml.preprocessing.missing_values import Handler
from nimbusml import FileDataStream
from nimbusml.preprocessing import DatasetTransformer
from nimbusml import Pipeline
from nimbusml.preprocessing import OnnxRunner

def get_tmp_file(suffix=None):
    """
    Create an empty temporary file and return its path.

    :param suffix: optional file name suffix passed to tempfile.mkstemp
    :return: path of the newly created file; the caller is responsible
        for deleting it
    """
    handle, path = tempfile.mkstemp(suffix=suffix)
    # mkstemp returns an open descriptor; wrap and close it immediately so
    # that only the path is handed back.
    with os.fdopen(handle, 'w'):
        pass
    return path

# Columns that are one-hot encoded and columns passed through Handler;
# each is mapped onto itself (output name == input name).
_categorical_columns = ['vendor_id', 'payment_type', 'passenger_count', 'rate_code']
_numeric_columns = ['trip_distance', 'trip_time_in_secs']

X_train_dprep = FileDataStream.read_csv(
    "E:/sources/vinod/NYCTaxiTipPrediction_train.csv")
X_test_dprep = FileDataStream.read_csv(
    "E:/sources/vinod/NYCTaxiTipPrediction_valid.csv")

try:
    # Stage 1: fit the featurization pipeline on the training data.
    _featurization_steps = [
        OneHotVectorizer(columns={name: name for name in _categorical_columns}),
        Handler(columns={name: name for name in _numeric_columns}),
    ]
    pipe_featurization = Pipeline(_featurization_steps)
    pipe_featurization.fit(X_train_dprep)

    # Stage 2: reuse the fitted featurization model and train a regressor
    # on top of it.
    _regressor = nml_linear.FastLinearRegressor(
        feature=_categorical_columns + _numeric_columns,
        label='fare_amount')
    pipe_training = Pipeline([
        DatasetTransformer(pipe_featurization.model),
        _regressor,
    ])
    pipe_training.fit(X_train_dprep)

    metrics, scores = pipe_training.test(X_test_dprep)
    print(metrics)
    print('training done')

    # Export the trained pipeline to an ONNX file in a temporary location.
    onnx_path = get_tmp_file('.onnx')
    pipe_training.export_to_onnx(
        onnx_path, 'com.microsoft.ml', onnx_version='Stable')
    print('export done')

    # Transform the validation data with the regular (non-ONNX) pipeline.
    result_standard = pipe_training.transform(X_test_dprep)
    print(result_standard)
    print('done transform using standard backend')

    # Transform the same data by running the exported ONNX model.
    # NOTE(review): the original comment says the ONNX result may carry
    # extra or differently named columns as a known issue - confirm.
    onnxrunner = OnnxRunner(model_file=onnx_path)
    result_onnx = onnxrunner.fit_transform(X_test_dprep)
    print('done transform using onnx backend')
    print(result_onnx)

except Exception as e:
    # Best-effort script: report any failure and fall through to "done".
    print('tragedy')
    print(e)

print("done")