From f0a9163d0bed52a7199aaa74d6a967cf13e2e164 Mon Sep 17 00:00:00 2001 From: Keren Fuentes Date: Thu, 14 Nov 2019 15:01:19 -0800 Subject: [PATCH] Squashed commit of the following: MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit a5e274ef8869576190bbb794360a5f56d998b470 Merge: b7db4fa47 d7f999653 Author: Keren Fuentes Date: Thu Nov 14 14:51:21 2019 -0800 Merge branch 'onnx_bin_classifiers' of https://github.com/Lynx1820/machinelearning into onnx_bin_classifiers commit b7db4fa475ba4bd52824eb98bbf5f5bf4a0a6f7a Author: Harish Kulkarni Date: Thu Nov 14 17:41:12 2019 +0000 Added onnx export support for KeyToValueMappingTransformer (#4455) commit f3e0f6b3a52f7e3998978239a0f52f35fd6197d8 Author: Eric Erhardt Date: Thu Nov 14 07:22:12 2019 -0600 Fix a flaky Extensions.ML test. (#4458) * Fix a flaky Extensions.ML test. Make the reload model tests more resistant to timing changes. * PR feedback. commit c1e190af4cccd17a81e03f8062a3b1b6fbb9226a Author: Harish Kulkarni Date: Thu Nov 14 05:24:14 2019 +0000 Added onnx export support for OptionalColumnTransform (#4454) * Initial work for adding onnx export support for OptionalColumnTransform * Implemented support for optional initializers in OnnxTransformer to support OptionalColumnTransform * Fixed handling of double values and non-long numeric types * Removed redundant line * Updated review comment commit f96761b3ca55f1ab19584b7195bc19d800e41248 Author: Harish Kulkarni Date: Thu Nov 14 03:17:12 2019 +0000 Fixed model saving and loading of OneVersusAllTrainer to include SoftMax (#4472) * Fixed model saving and loading of OneVersusAllTrainer to include SoftMax * Modified existing test to include SoftMax option * Modified test to verify both cases: when UseSoftmax is true and false commit d45cc8a03f39eded3a0123e9a6c0311507cac2a4 Author: Jake <31937616+JakeRadMSFT@users.noreply.github.com> Date: Wed Nov 13 17:26:49 2019 -0800 Add InternalsVisibleTo in AutoML and CodeGenerator for the 
assembly Microsoft.ML.ModelBuilder.AutoMLService.Gpu (#4474) commit 5e83e235a9e0b745cafec513656153457f6cb788 Author: Eric Erhardt Date: Wed Nov 13 16:09:05 2019 -0600 CpuMathNative assembly is not getting copied when using packages.config. (#4465) When we refactored CpuMath to support netcoreapp3.0, we broke the packages.config support to copy the native assembly. This fixes it again by copying the file from the correct location. Fix #93 commit 693250b9c636a9a343ff78976c625a81a179109a Author: Harish Kulkarni Date: Wed Nov 13 21:58:07 2019 +0000 Added onnx export support for WordTokenizingTransformer and NgramExtractingTransformer (#4451) * Added onnx export support for string related transforms * Updated baseline test files A large portion of this commit is upgrading the baseline test files. The rest of the fixes deal with build breaks resulting from the upgrade of ORT version. * Fixed bugs in ValueToKeyMappingTransformer and added additional tests commit 59109104e62382c5593da0f4c8c9896e64c677b7 Author: Antonio Velázquez <38739674+antoniovs1029@users.noreply.github.com> Date: Mon Nov 11 17:19:39 2019 -0800 Fixes #4292 about using PFI with BPT and CMPB (#4306) *Changes in PredictionTransformer.cs and Calibrator.cs to fix the problem of the create methods not being called, to make CMP load its internal calibrator and predictor first so as to assign the correct parameter types and runtimes, and added a PredictionTransformerLoadTypeAttribute so that the binary prediction transformer knows what type to assign when loading a CMP as its internal model. *Added a working sample for using PFI with BPT and CMPB while loading a model from disk. This is based entirely on the original sample. *Added file CalibratedModelParametersTests.cs with tests that the CMPs modified in this PR are now being correctly loaded from disk. *Changed a couple of tests in LbfgsTests.cs that failed because they used casts that now return 'null'. 
commit bcdac553aecf27726e1dc0a53ebc050dc1a429e8 Author: Brian Stark <54910472+bpstark@users.noreply.github.com> Date: Mon Nov 11 13:42:42 2019 -0800 Stabilize the LR test (#4446) * Stabilize the LR test Found an issue with how we were using random for our ImageClassificationTrainer. This caused instability in our unit test, as we were not able to control the random seed. Modified the code to now use the same random object throughout the trainer, thus allowing us to control the seed and therefore have predictable output. commit d7f9996534ce5e33817f81c92d2b277d9becdf88 Author: Keren Fuentes Date: Mon Nov 11 11:33:17 2019 -0800 workaround Scores commit 7fba31c3e8e358469103e61d4cd2b94292bb09a3 Merge: 93388b60b c96d69043 Author: Keren Fuentes Date: Mon Nov 11 11:25:28 2019 -0800 merging changes commit 93388b60b5286b67ad66e6006c3bc9631733a5c0 Author: Keren Fuentes Date: Mon Nov 11 11:19:59 2019 -0800 Added extraction of score column before node creation commit ea71828bdba14af8a0d1d57162fa8c053136c413 Author: Keren Fuentes Date: Fri Nov 8 15:53:11 2019 -0800 fix for binary classification trainers export to onnx commit 6fad293933f6765b0b5444adcc32907f7a47b091 Author: Keren Fuentes Date: Thu Oct 31 15:26:43 2019 -0700 Revert "draft regression test" This reverts commit 1ad45c995516b9d39fc05aca855ce2abe96c407b. 
commit 83c1c8053902bfc06281a85244b0f3e4a1f1dd50 Author: Keren Fuentes Date: Thu Oct 31 15:24:23 2019 -0700 draft regression test commit 888416176eb946697b157b7c5633d228a64f6b32 Author: frank-dong-ms <55860649+frank-dong-ms@users.noreply.github.com> Date: Fri Nov 8 20:20:53 2019 -0800 nightly build pipeline (#4444) * nightly build pipeline commit c96d6904333bd1f10f9e895713225f69879b9e54 Author: Keren Fuentes Date: Fri Nov 8 15:53:11 2019 -0800 fix for binary classification trainers export to onnx commit 8100364471a41b79186b18d1f29fab1c735b4bb0 Author: Keren Fuentes Date: Thu Oct 31 15:26:43 2019 -0700 Revert "draft regression test" This reverts commit 1ad45c995516b9d39fc05aca855ce2abe96c407b. commit 81381e2714f9e5a2f1435a002946df29fc26790d Author: Keren Fuentes Date: Thu Oct 31 15:24:23 2019 -0700 draft regression test --- .../Scorers/BinaryClassifierScorer.cs | 28 ++++-- src/Microsoft.ML.FastTree/FastTree.cs | 3 +- .../Standard/LinearModelParameters.cs | 6 +- test/Microsoft.ML.Tests/OnnxConversionTest.cs | 88 ++++++++++++++++++- 4 files changed, 114 insertions(+), 11 deletions(-) diff --git a/src/Microsoft.ML.Data/Scorers/BinaryClassifierScorer.cs b/src/Microsoft.ML.Data/Scorers/BinaryClassifierScorer.cs index 9ef81f9047..0d86606627 100644 --- a/src/Microsoft.ML.Data/Scorers/BinaryClassifierScorer.cs +++ b/src/Microsoft.ML.Data/Scorers/BinaryClassifierScorer.cs @@ -197,14 +197,32 @@ private protected override void SaveAsOnnxCore(OnnxContext ctx) for (int iinfo = 0; iinfo < Bindings.InfoCount; ++iinfo) outColumnNames[iinfo] = Bindings.GetColumnName(Bindings.MapIinfoToCol(iinfo)); - //Check if "Probability" column was generated by the base class, only then - //label can be predicted. + /* If the probability column was generated, then the classification threshold is set to 0.5. Otherwise, + the predicted label is based on the sign of the score. + REVIEW: Binarizer should always have at least two output columns? 
+ */ + string opType = "Binarizer"; + var binarizerOutput = ctx.AddIntermediateVariable(null, "BinarizerOutput", true); + if (Bindings.InfoCount >= 3 && ctx.ContainsColumn(outColumnNames[2])) { - string opType = "Binarizer"; - var node = ctx.CreateNode(opType, new[] { ctx.GetVariableName(outColumnNames[2]) }, - new[] { ctx.GetVariableName(outColumnNames[0]) }, ctx.GetNodeName(opType)); + var node = ctx.CreateNode(opType, ctx.GetVariableName(outColumnNames[2]), binarizerOutput, ctx.GetNodeName(opType)); node.AddAttribute("threshold", 0.5); + + opType = "Cast"; + node = ctx.CreateNode(opType, binarizerOutput, ctx.GetVariableName(outColumnNames[0]), ctx.GetNodeName(opType), ""); + var t = InternalDataKindExtensions.ToInternalDataKind(DataKind.Boolean).ToType(); + node.AddAttribute("to", t); + } + else if (Bindings.InfoCount == 2) + { + var node = ctx.CreateNode(opType, ctx.GetVariableName(outColumnNames[1]), binarizerOutput, ctx.GetNodeName(opType)); + node.AddAttribute("threshold", 0.0); + + opType = "Cast"; + node = ctx.CreateNode(opType, binarizerOutput, ctx.GetVariableName(outColumnNames[0]), ctx.GetNodeName(opType), ""); + var t = InternalDataKindExtensions.ToInternalDataKind(DataKind.Boolean).ToType(); + node.AddAttribute("to", t); } } diff --git a/src/Microsoft.ML.FastTree/FastTree.cs b/src/Microsoft.ML.FastTree/FastTree.cs index 82772ff151..c0bd8e52d2 100644 --- a/src/Microsoft.ML.FastTree/FastTree.cs +++ b/src/Microsoft.ML.FastTree/FastTree.cs @@ -3111,7 +3111,8 @@ bool ISingleCanSaveOnnx.SaveAsOnnx(OnnxContext ctx, string[] outputNames, string } string opType = "TreeEnsembleRegressor"; - var node = ctx.CreateNode(opType, new[] { featureColumn }, outputNames, ctx.GetNodeName(opType)); + string scoreVarName = (Utils.Size(outputNames) == 2) ? 
outputNames[1] : outputNames[0]; // Get Score from PredictedLabel and/or Score columns + var node = ctx.CreateNode(opType, new[] { featureColumn }, new[] { scoreVarName }, ctx.GetNodeName(opType)); node.AddAttribute("post_transform", PostTransform.None.GetDescription()); node.AddAttribute("n_targets", 1); diff --git a/src/Microsoft.ML.StandardTrainers/Standard/LinearModelParameters.cs b/src/Microsoft.ML.StandardTrainers/Standard/LinearModelParameters.cs index 27d9ad6f25..9f6c45ae4d 100644 --- a/src/Microsoft.ML.StandardTrainers/Standard/LinearModelParameters.cs +++ b/src/Microsoft.ML.StandardTrainers/Standard/LinearModelParameters.cs @@ -240,10 +240,10 @@ JToken ISingleCanSavePfa.SaveAsPfa(BoundPfaContext ctx, JToken input) bool ISingleCanSaveOnnx.SaveAsOnnx(OnnxContext ctx, string[] outputs, string featureColumn) { Host.CheckValue(ctx, nameof(ctx)); - Host.Check(Utils.Size(outputs) == 1); - string opType = "LinearRegressor"; - var node = ctx.CreateNode(opType, new[] { featureColumn }, outputs, ctx.GetNodeName(opType)); + string scoreVarName = (Utils.Size(outputs) == 2) ? outputs[1] : outputs[0]; // Get Score from PredictedLabel and/or Score columns + + var node = ctx.CreateNode(opType, new[] { featureColumn }, new[] { scoreVarName }, ctx.GetNodeName(opType)); // Selection of logit or probit output transform. 
enum {'NONE', 'LOGIT', 'PROBIT} node.AddAttribute("post_transform", "NONE"); node.AddAttribute("targets", 1); diff --git a/test/Microsoft.ML.Tests/OnnxConversionTest.cs b/test/Microsoft.ML.Tests/OnnxConversionTest.cs index 17a04d7fc7..bb7f4ae631 100644 --- a/test/Microsoft.ML.Tests/OnnxConversionTest.cs +++ b/test/Microsoft.ML.Tests/OnnxConversionTest.cs @@ -132,6 +132,14 @@ private class BreastCancerMulticlassExample [LoadColumn(2, 9), VectorType(8)] public float[] Features; } + private class BreastCancerBinaryClassification + { + [LoadColumn(0)] + public bool Label; + + [LoadColumn(2, 9), VectorType(8)] + public float[] Features; + } [LessThanNetCore30OrNotNetCoreFact("netcoreapp3.0 output differs from Baseline. Tracked by https://github.com/dotnet/machinelearning/issues/2087")] public void KmeansOnnxConversionTest() @@ -188,6 +196,54 @@ public void KmeansOnnxConversionTest() Done(); } + [Fact] + public void binaryClassificationTrainersOnnxConversionTest() + { + var mlContext = new MLContext(seed: 1); + string dataPath = GetDataPath("breast-cancer.txt"); + // Now read the file (remember though, readers are lazy, so the actual reading will happen when the data is accessed). 
+ var dataView = mlContext.Data.LoadFromTextFile(dataPath, separatorChar: '\t', hasHeader: true); + IEstimator[] estimators = { + mlContext.BinaryClassification.Trainers.SymbolicSgdLogisticRegression(), + mlContext.BinaryClassification.Trainers.SgdCalibrated(), + mlContext.BinaryClassification.Trainers.AveragedPerceptron(), + mlContext.BinaryClassification.Trainers.FastForest(), + mlContext.BinaryClassification.Trainers.LinearSvm(), + mlContext.BinaryClassification.Trainers.SdcaNonCalibrated(), + mlContext.BinaryClassification.Trainers.SgdNonCalibrated(), + mlContext.BinaryClassification.Trainers.FastTree(), + mlContext.BinaryClassification.Trainers.LbfgsLogisticRegression(), + mlContext.BinaryClassification.Trainers.LightGbm(), + mlContext.BinaryClassification.Trainers.SdcaLogisticRegression(), + mlContext.BinaryClassification.Trainers.SgdCalibrated(), + mlContext.BinaryClassification.Trainers.SymbolicSgdLogisticRegression(), + }; + var initialPipeline = mlContext.Transforms.ReplaceMissingValues("Features"). + Append(mlContext.Transforms.NormalizeMinMax("Features")); + foreach (var estimator in estimators) + { + var pipeline = initialPipeline.Append(estimator); + var model = pipeline.Fit(dataView); + var transformedData = model.Transform(dataView); + var onnxModel = mlContext.Model.ConvertToOnnxProtobuf(model, dataView); + // Compare model scores produced by ML.NET and ONNX's runtime. + if (IsOnnxRuntimeSupported()) + { + var onnxFileName = $"{estimator.ToString()}.onnx"; + var onnxModelPath = GetOutputPath(onnxFileName); + SaveOnnxModel(onnxModel, onnxModelPath, null); + // Evaluate the saved ONNX model using the data used to train the ML.NET pipeline. 
+ string[] inputNames = onnxModel.Graph.Input.Select(valueInfoProto => valueInfoProto.Name).ToArray(); + string[] outputNames = onnxModel.Graph.Output.Select(valueInfoProto => valueInfoProto.Name).ToArray(); + var onnxEstimator = mlContext.Transforms.ApplyOnnxModel(outputNames, inputNames, onnxModelPath); + var onnxTransformer = onnxEstimator.Fit(dataView); + var onnxResult = onnxTransformer.Transform(dataView); + CompareSelectedR4ScalarColumns(transformedData.Schema[5].Name, outputNames[3], transformedData, onnxResult, 3); + CompareSelectedScalarColumns(transformedData.Schema[4].Name, outputNames[2], transformedData, onnxResult); + } + } + Done(); + } private class DataPoint { [VectorType(3)] @@ -1081,7 +1137,8 @@ private void CreateDummyExamplesToMakeComplierHappy() var dummyExample = new BreastCancerFeatureVector() { Features = null }; var dummyExample1 = new BreastCancerCatFeatureExample() { Label = false, F1 = 0, F2 = "Amy" }; var dummyExample2 = new BreastCancerMulticlassExample() { Label = "Amy", Features = null }; - var dummyExample3 = new SmallSentimentExample() { Tokens = null }; + var dummyExample3 = new BreastCancerBinaryClassification() { Label = false, Features = null }; + var dummyExample4 = new SmallSentimentExample() { Tokens = null }; } private void CompareResults(string leftColumnName, string rightColumnName, IDataView left, IDataView right) @@ -1243,7 +1300,34 @@ private void CompareSelectedR4ScalarColumns(string leftColumnName, string rightC // Scalar such as R4 (float) is converted to [1, 1]-tensor in ONNX format for consitency of making batch prediction. 
Assert.Equal(1, actual.Length); - Assert.Equal(expected, actual.GetItemOrDefault(0), precision); + CompareNumbersWithTolerance(expected, actual.GetItemOrDefault(0), null, precision); + } + } + } + private void CompareSelectedScalarColumns(string leftColumnName, string rightColumnName, IDataView left, IDataView right) + { + var leftColumn = left.Schema[leftColumnName]; + var rightColumn = right.Schema[rightColumnName]; + + using (var expectedCursor = left.GetRowCursor(leftColumn)) + using (var actualCursor = right.GetRowCursor(rightColumn)) + { + T expected = default; + VBuffer actual = default; + var expectedGetter = expectedCursor.GetGetter(leftColumn); + var actualGetter = actualCursor.GetGetter>(rightColumn); + while (expectedCursor.MoveNext() && actualCursor.MoveNext()) + { + expectedGetter(ref expected); + actualGetter(ref actual); + var actualVal = actual.GetItemOrDefault(0); + + Assert.Equal(1, actual.Length); + + if (typeof(T) == typeof(ReadOnlyMemory)) + Assert.Equal(expected.ToString(), actualVal.ToString()); + else + Assert.Equal(expected, actualVal); } } }