Frontend Support for Sagemaker-Sklearn-Extension Models Part I (neo-ai#145)

CloudManX · Ubuntu · Ubuntu · Trevor Morris · commit e41640da847f · 2020-10-13T18:44:32.000Z
* tmp commit

* tmp checkpoint

* checkpoint

* add auto_ml frontend parser

* +registration of shapefunc for isnan and isinf, enable dyn tiling in robust imputer

* tmp

* unit tests for robustImputer, thresholdOneHotEncoder, robustStandardScaler and ColumnTransformer

* Add ASF header

* docker sklearn installation

* docker installation of Sklearn

* typo fixes

* documentation fixes and error handling when sklearn is not installed

Co-authored-by: Ubuntu &lt;ubuntu@ip-172-31-28-239.us-west-2.compute.internal&gt;
Co-authored-by: Ubuntu &lt;ubuntu@ip-172-31-72-105.ec2.internal&gt;
diff --git a/docker/Dockerfile.ci_cpu b/docker/Dockerfile.ci_cpu
@@ -80,6 +80,10 @@ RUN bash /install/ubuntu_install_arm_compute_lib.sh
 COPY install/ubuntu_install_caffe.sh /install/ubuntu_install_caffe.sh
 RUN bash /install/ubuntu_install_caffe.sh
 
+# Sagemaker-Sklearn-Extension deps: the install script uses pip3 to install
+# Sklearn and Sagemaker-Scikit-Learn-Extension into the CI image
+COPY install/ubuntu_install_sklearn.sh /install/ubuntu_install_sklearn.sh
+RUN bash /install/ubuntu_install_sklearn.sh
+
 # Github Arm(R) Ethos(TM)-N NPU driver
 COPY install/ubuntu_install_ethosn_driver_stack.sh /install/ubuntu_install_ethosn_driver_stack.sh
 RUN bash /install/ubuntu_install_ethosn_driver_stack.sh
diff --git a/docker/install/ubuntu_install_sklearn.sh b/docker/install/ubuntu_install_sklearn.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Abort on the first failing command, on unset variables and on pipeline errors.
+set -e
+set -u
+set -o pipefail
+
+# Install the latest releases of scikit-learn and Sagemaker-Scikit-Learn-Extension.
+# The PyPI distribution is named "scikit-learn"; "sklearn" is only a deprecated
+# placeholder name and must not be used as an install target.
+pip3 install scikit-learn
+pip3 install sagemaker-scikit-learn-extension
diff --git a/python/tvm/relay/frontend/__init__.py b/python/tvm/relay/frontend/__init__.py
@@ -34,3 +34,5 @@
 from .darknet import from_darknet
 from .pytorch import from_pytorch
 from .caffe import from_caffe
+from .sklearn import from_sklearn
+from .sklearn import from_auto_ml
diff --git a/python/tvm/relay/frontend/sklearn.py b/python/tvm/relay/frontend/sklearn.py
@@ -0,0 +1,174 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=invalid-name, import-self, len-as-condition, unused-argument, too-many-lines
+# pylint: disable=import-outside-toplevel
+
+import numpy as np
+import tvm
+from tvm.ir import IRModule
+
+from ... import nd as _nd
+from .. import analysis
+from .. import expr as _expr
+from .. import function as _function
+from .. import op as _op
+from .. import vision as _vision
+
+from ..function import Function
+from ..expr import Call, Let
+from ..expr import If, Tuple, TupleGetItem
+from ..expr import RefCreate, RefRead, RefWrite
+from ..expr_functor import ExprFunctor
+from ..adt import Match, Clause
+
+from .common import AttrCvt, Renamer, ExprTable
+from .common import get_relay_op, new_var, infer_shape, infer_channels
+from .common import infer_type, get_name
+from .common import infer_value as _infer_value
+from .common import infer_value_simulated as _infer_value_simulated
+
+
+def _SimpleImputer(op, inexpr, dshape, dtype, columns=None):
+    """
+    Scikit-Learn Transformer:
+    Replace NaN entries of the input with the values stored in ``op.statistics_``.
+
+    Parameters
+    ----------
+    op : imputer object exposing ``statistics_``
+    inexpr : relay expression for the input data (columns live on axis 1)
+    dshape : input shape; not used by this converter
+    dtype : data type of the fill-value constant
+    columns : not used by this converter
+
+    Returns
+    -------
+    relay expression in which NaN entries are replaced by the fill values
+    """
+    statistics = np.array(op.statistics_, dtype=dtype)
+
+    # The repetition vector [shape[0], 1] is assembled from shape_of so that the
+    # fill values can be tiled even when the first dimension is not static.
+    leading_dim = _op.take(_op.shape_of(inexpr), _op.const([0]))
+    tile_reps = _op.concatenate([leading_dim, _op.const([1])], axis=0)
+    tiled_fill = _op.tile(_op.const(statistics), reps=tile_reps)
+
+    # Gather every column index along axis 1 of the tiled fill values.
+    all_columns = _op.const(np.arange(len(op.statistics_)))
+    tiled_fill = _op.take(tiled_fill, indices=all_columns, axis=1)
+
+    return _op.where(_op.isnan(inexpr), tiled_fill, inexpr)
+
+def _RobustImputer(op, inexpr, dshape, dtype, columns=None):
+    """
+    Sagemaker-Scikit-Learn-Extension Transformer:
+    Imputation transformer for completing missing values with multi-column support.
+
+    When ``columns`` is truthy, only those indices are gathered along axis 1.
+    When ``op.mask_function`` is set, infinite entries are replaced by NaN before
+    the wrapped ``op.simple_imputer_`` is converted.
+    NOTE(review): every non-None ``mask_function`` is treated as an infinity mask
+    here - confirm that this matches the mask functions used in practice.
+
+    Returns
+    -------
+    relay expression produced by the SimpleImputer converter
+    """
+    data = inexpr
+    if columns:
+        data = _op.take(data, indices=_op.const(columns), axis=1)
+
+    if op.mask_function is not None:
+        nan_scalar = _op.const(np.array(np.nan, dtype=dtype))
+        data = _op.where(_op.isinf(data), _op.full_like(data, nan_scalar), data)
+
+    return _SimpleImputer(op.simple_imputer_, data, dshape, dtype, columns)
+    
+def _ThresholdOneHotEncoder(op, inexpr, dshape, dtype, columns=None):
+    """
+    Sagemaker-Scikit-Learn-Extension Transformer:
+    Encode categorical integer features as a one-hot numeric array, with optional
+    restrictions on feature encoding.
+
+    The input is split along axis 1 into one piece per entry of
+    ``op.categories_``. Each piece is compared against its category values, the
+    resulting masks are cast to ``dtype`` and concatenated along axis 1.
+    When ``columns`` is truthy, only those indices are gathered along axis 1 first.
+    """
+    data = inexpr
+    if columns:
+        data = _op.take(data, indices=_op.const(columns), axis=1)
+
+    categories = op.categories_
+    per_feature = _op.split(data, len(categories), axis=1)
+
+    encoded = []
+    for idx, category in enumerate(categories):
+        # Repeat the column once per category value so that it lines up with the
+        # category vector, then mark the positions where the values match.
+        repeated = _op.tile(per_feature[idx], (1, len(category)))
+        matches = _op.equal(repeated, _op.const(np.array(category, dtype=dtype)))
+        encoded.append(_op.cast(matches, dtype))
+
+    return _op.concatenate(encoded, axis=1)
+
+def _RobustStandardScaler(op, inexpr, dshape, dtype, columns=None):
+    """
+    Sagemaker-Scikit-Learn-Extension Transformer:
+    Standardize features by removing the mean and scaling to unit variance
+
+    The wrapped ``op.scaler_`` is applied the way scikit-learn's StandardScaler
+    transforms data: the mean is only subtracted when ``with_mean`` is enabled and
+    the scale is only divided out when ``with_std`` is enabled. A scaler object
+    that does not expose these flags is treated as having both enabled.
+
+    Parameters
+    ----------
+    op : transformer exposing the fitted ``scaler_``
+    inexpr : relay expression for the input data
+    dshape : input shape; not used by this converter
+    dtype : data type of the mean and scale constants
+    columns : not used by this converter
+
+    Returns
+    -------
+    relay expression for the standardized data
+    """
+    scaler = op.scaler_
+    ret = inexpr
+    # Skipping a disabled stage avoids building a constant from ``None``
+    # statistics, which would turn the whole output into NaN.
+    if getattr(scaler, 'with_mean', True):
+        ret = _op.subtract(ret, _op.const(np.array(scaler.mean_, dtype), dtype))
+    if getattr(scaler, 'with_std', True):
+        ret = _op.divide(ret, _op.const(np.array(scaler.scale_, dtype), dtype))
+    return ret
+
+def _ColumnTransformer(op, inexpr, dshape, dtype, columns=None):
+    """
+    Scikit-Learn Compose:
+    Applies transformers to columns of an array
+
+    Every entry of ``op.transformers_`` is converted and the results are
+    concatenated along axis 1:
+
+    * a Pipeline is converted step by step, each step consuming the output of
+      the previous one; the column selection is handed to the first step only,
+    * a bare transformer is converted as a single step,
+    * the string ``'drop'`` contributes nothing,
+    * the string ``'passthrough'`` forwards the selected columns unchanged.
+
+    Parameters
+    ----------
+    op : fitted ColumnTransformer exposing ``transformers_``
+    inexpr : relay expression for the input data
+    dshape : input shape, forwarded to the step converters
+    dtype : data type, forwarded to the step converters
+    columns : not used by this converter
+
+    Returns
+    -------
+    relay expression concatenating the outputs of all transformers
+    """
+    out = []
+    for _, transformer, cols in op.transformers_:
+        # A fitted ColumnTransformer can hold string placeholders instead of
+        # estimators, e.g. for the remainder columns.
+        if isinstance(transformer, str) and transformer == 'drop':
+            continue
+        if isinstance(transformer, str) and transformer == 'passthrough':
+            out.append(_op.take(inexpr, indices=_op.const(cols), axis=1))
+            continue
+
+        # Pipelines expose their stages through ``steps``; anything else is
+        # handled as a one-step pipeline.
+        steps = getattr(transformer, 'steps', None)
+        if steps is None:
+            steps = [(None, transformer)]
+
+        expr = inexpr
+        step_columns = cols
+        for _step_name, step in steps:
+            expr = sklearn_op_to_relay(step, expr, dshape, dtype, step_columns)
+            # Later steps work on the already selected columns.
+            step_columns = None
+        out.append(expr)
+
+    return _op.concatenate(out, axis=1)
+
+# Dispatch table: class name of a supported transformer -> converter function.
+_convert_map = {
+    'ColumnTransformer': _ColumnTransformer,
+    'RobustImputer': _RobustImputer,
+    'RobustStandardScaler': _RobustStandardScaler,
+    'SimpleImputer': _SimpleImputer,
+    'ThresholdOneHotEncoder': _ThresholdOneHotEncoder,
+}
+
+def sklearn_op_to_relay(op, inexpr, dshape, dtype, columns=None):
+    """
+    Convert one transformer object into a relay expression.
+
+    The converter is looked up in ``_convert_map`` by the class name of ``op`` and
+    called with all arguments unchanged. A ``KeyError`` propagates when the class
+    name has no entry in the table.
+    """
+    converter = _convert_map[type(op).__name__]
+    return converter(op, inexpr, dshape, dtype, columns)
+
+def from_sklearn(model,
+                 shape=None,
+                 dtype="float32",
+                 columns=None):
+    """
+    Convert a single supported transformer into a relay module.
+
+    Parameters
+    ----------
+    model : transformer whose class name is registered in ``_convert_map``
+    shape : shape of the relay input variable, which is named 'input'
+    dtype : data type of the input variable; also forwarded to the converter
+    columns : optional column selection forwarded to the converter
+
+    Returns
+    -------
+    mod : IRModule built from the converted expression
+    params : always an empty list
+
+    Raises
+    ------
+    ImportError : when scikit-learn cannot be imported
+    """
+    # Importing is only an availability check; the module itself is not used here.
+    try:
+        import sklearn
+    except ImportError as e:
+        raise ImportError(
+            "Unable to import scikit-learn which is required {}".format(e))
+
+    data = _expr.var('input', shape=shape, dtype=dtype)
+    body = sklearn_op_to_relay(model, data, shape, dtype, columns)
+
+    relay_func = _function.Function(analysis.free_vars(body), body)
+    return IRModule.from_expr(relay_func), []
+
+def from_auto_ml(model,
+                shape=None,
+                dtype="float32"):
+    """
+    Convert the feature transformer of an auto-ml model into a relay module.
+
+    The steps of ``model.feature_transformer`` are converted in order, each one
+    consuming the expression produced by the previous step. No column selection
+    is passed to the steps.
+
+    Parameters
+    ----------
+    model : object exposing ``feature_transformer.steps``
+    shape : shape of the relay input variable, which is named 'input'
+    dtype : data type of the input variable; also forwarded to every converter
+
+    Returns
+    -------
+    mod : IRModule built from the converted expression
+    params : always an empty list
+
+    Raises
+    ------
+    ImportError : when scikit-learn cannot be imported
+    """
+    # Importing is only an availability check; the module itself is not used here.
+    try:
+        import sklearn
+    except ImportError as e:
+        raise ImportError(
+            "Unable to import scikit-learn which is required {}".format(e))
+
+    expr = _expr.var('input', shape=shape, dtype=dtype)
+    for _, step in model.feature_transformer.steps:
+        expr = sklearn_op_to_relay(step, expr, shape, dtype, None)
+
+    relay_func = _function.Function(analysis.free_vars(expr), expr)
+    return IRModule.from_expr(relay_func), []
diff --git a/python/tvm/relay/op/_tensor.py b/python/tvm/relay/op/_tensor.py
@@ -272,3 +272,7 @@ def elemwise_shape_func(attrs, inputs, _):
 register_shape_func("clip", False, elemwise_shape_func)
 register_shape_func("log2", False, elemwise_shape_func)
 register_shape_func("sigmoid", False, elemwise_shape_func)
+# Shape functions for the ops emitted by the sklearn frontend converters.
+# NOTE(review): "where" takes three inputs; confirm that elemwise_shape_func gives
+# the right output shape when those inputs do not all share one shape.
+register_shape_func("isnan", False, elemwise_shape_func)
+register_shape_func("isinf", False, elemwise_shape_func)
+register_shape_func("where", False, elemwise_shape_func)
+
diff --git a/tests/python/frontend/sklearn/test_forward.py b/tests/python/frontend/sklearn/test_forward.py
@@ -0,0 +1,124 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+import numpy as np
+
+from sklearn.pipeline import Pipeline
+from sklearn.impute import SimpleImputer
+from sklearn.compose import ColumnTransformer
+from sagemaker_sklearn_extension.impute import RobustImputer
+from sagemaker_sklearn_extension.preprocessing import RobustStandardScaler
+from sagemaker_sklearn_extension.preprocessing import ThresholdOneHotEncoder
+
+from tvm import topi
+import tvm.topi.testing
+import tvm
+from tvm import te
+from tvm import relay
+from tvm.contrib import graph_runtime
+import scipy
+
+class SklearnTestHelper:
+    """Test helper that converts a model to relay and runs it on the VM executor."""
+
+    def __init__(self, target='llvm', ctx=tvm.cpu(0)):
+        self.compiled_model = None
+        self.target = target
+        self.ctx = ctx
+        # Executor created by compile(); stays None until a model was compiled.
+        self.ex = None
+
+    def compile(self, model, dshape, dtype, columns=None, auto_ml=False):
+        """
+        Convert ``model`` to a relay module and create a 'vm' executor for it.
+
+        ``auto_ml`` selects ``from_auto_ml`` instead of ``from_sklearn``;
+        ``columns`` is only forwarded to ``from_sklearn``.
+        """
+        if auto_ml:
+            mod, _ = relay.frontend.from_auto_ml(model, dshape, dtype)
+        else:
+            mod, _ = relay.frontend.from_sklearn(model, dshape, dtype, columns)
+
+        self.ex = relay.create_executor('vm', mod=mod, ctx=self.ctx, target=self.target)
+
+    def run(self, data):
+        """Evaluate the compiled model on ``data`` and return a numpy array."""
+        if self.ex is None:
+            raise RuntimeError("compile() must be called before run()")
+        result = self.ex.evaluate()(data)
+        return result.asnumpy()
+
+def _test_model_impl(helper, model, dshape, input_data):
+    """Compile ``model`` as float32 and compare the TVM output with ``model.transform``."""
+    helper.compile(model, dshape, 'float32')
+    expected = model.transform(input_data)
+    actual = helper.run(input_data)
+    tvm.testing.assert_allclose(expected, actual, rtol=1e-5, atol=1e-5)
+
+def test_simple_imputer():
+    """SimpleImputer with the median strategy on float32 data containing NaN."""
+    samples = np.array([[4, 5, np.nan, 7],
+                        [0, np.nan, 2, 3],
+                        [8, 9, 10, 11],
+                        [np.nan, 13, 14, 15]], dtype=np.float32)
+
+    imputer = SimpleImputer(missing_values=np.nan, strategy='median')
+    imputer.fit(samples)
+
+    # The first dimension is declared as relay.Any(), the column count is fixed.
+    input_shape = (relay.Any(), len(samples[0]))
+    _test_model_impl(SklearnTestHelper(), imputer, input_shape, samples)
+
+def test_robust_imputer():
+    """RobustImputer with a constant NaN fill value and no mask function."""
+    samples = np.array([[4, 5, np.nan, 7],
+                        [0, np.nan, 2, 3],
+                        [8, 9, 10, 11],
+                        [np.nan, 13, 14, 15]], dtype=np.float32)
+
+    imputer = RobustImputer(dtype=None, strategy="constant", fill_values=np.nan,
+                            mask_function=None)
+    imputer.fit(samples)
+
+    # The first dimension is declared as relay.Any(), the column count is fixed.
+    input_shape = (relay.Any(), len(samples[0]))
+    _test_model_impl(SklearnTestHelper(), imputer, input_shape, samples)
+
+def test_robust_scaler():
+    """RobustStandardScaler fitted on a small two-column float32 matrix."""
+    samples = np.array([[0, 0], [0, 0], [1, 1], [1, 1]], dtype=np.float32)
+
+    scaler = RobustStandardScaler()
+    scaler.fit(samples)
+
+    # The first dimension is declared as relay.Any(), the column count is fixed.
+    input_shape = (relay.Any(), len(samples[0]))
+    _test_model_impl(SklearnTestHelper(), scaler, input_shape, samples)
+
+def test_threshold_onehot_encoder():
+    """ThresholdOneHotEncoder on int32 data, compared against the dense sklearn output."""
+    samples = np.array([[10, 1, 7], [11, 3, 8], [11, 2, 9]], dtype=np.int32)
+
+    encoder = ThresholdOneHotEncoder()
+    encoder.fit(samples)
+    # The fitted categories are replaced by explicit Python lists before conversion.
+    encoder.categories_ = [[10, 11], [1, 2, 3], [7, 8, 9]]
+
+    helper = SklearnTestHelper()
+    input_shape = (relay.Any(), len(samples[0]))
+    helper.compile(encoder, input_shape, 'int32')
+
+    # transform() output is converted with toarray() to get a dense reference.
+    expected = encoder.transform(samples).toarray()
+    actual = helper.run(samples)
+    tvm.testing.assert_allclose(expected, actual, rtol=1e-5, atol=1e-5)
+
+def test_column_transfomer():
+    """ColumnTransformer wrapping a one-step RobustImputer pipeline over all columns."""
+    samples = np.array([[4, 5, np.nan, 7],
+                        [0, np.nan, 2, 3],
+                        [8, 9, 10, 11],
+                        [np.nan, 13, 14, 15]], dtype=np.float32)
+
+    imputer = RobustImputer(fill_values=np.nan, strategy='constant')
+    numeric_pipeline = Pipeline(steps=[('robustimputer', imputer)])
+    transformer = ColumnTransformer(
+        transformers=[('numeric_processing', numeric_pipeline, [0, 1, 2, 3])])
+    transformer.fit(samples)
+
+    # Both dimensions of the input are declared as relay.Any() here.
+    input_shape = (relay.Any(), relay.Any())
+    _test_model_impl(SklearnTestHelper(), transformer, input_shape, samples)
+
+
+if __name__ == '__main__':
+    # Run every test in this file, in a fixed order, when executed as a script.
+    for _test_case in (test_simple_imputer,
+                       test_robust_imputer,
+                       test_robust_scaler,
+                       test_column_transfomer,
+                       test_threshold_onehot_encoder):
+        _test_case()