From 81381e2714f9e5a2f1435a002946df29fc26790d Mon Sep 17 00:00:00 2001 From: Keren Fuentes Date: Thu, 31 Oct 2019 15:24:23 -0700 Subject: [PATCH 1/3] draft regression test --- test/Microsoft.ML.Tests/OnnxConversionTest.cs | 255 +++++++++++++++++- 1 file changed, 241 insertions(+), 14 deletions(-) diff --git a/test/Microsoft.ML.Tests/OnnxConversionTest.cs b/test/Microsoft.ML.Tests/OnnxConversionTest.cs index 0a79681b0b7..a2fd9091f57 100644 --- a/test/Microsoft.ML.Tests/OnnxConversionTest.cs +++ b/test/Microsoft.ML.Tests/OnnxConversionTest.cs @@ -21,6 +21,7 @@ using Microsoft.ML.Trainers; using Microsoft.ML.Transforms; using Microsoft.ML.Transforms.Onnx; +using Microsoft.ML.Transforms.Text; using Newtonsoft.Json; using Xunit; using Xunit.Abstractions; @@ -28,8 +29,44 @@ namespace Microsoft.ML.Tests { - public class OnnxConversionTest : BaseTestBaseline + +public class OnnxConversionTest : BaseTestBaseline { + + private static IEnumerable GenerateRandomDataPoints(int count, + int seed = 0) + { + var random = new Random(seed); + for (int i = 0; i < count; i++) + { + float label = (float)random.NextDouble(); + yield return new DataPoint2 + { + Label = label, + // Create random features that are correlated with the label. + Features = Enumerable.Repeat(label, 50).Select( + x => x + (float)random.NextDouble()).ToArray() + }; + } + } + + // Example with label and 50 feature values. A data set is a collection of + // such examples. + private class DataPoint2 + { + public float Label { get; set; } + [VectorType(50)] + public float[] Features { get; set; } + } + + // Class used to capture predictions. + private class Prediction + { + // Original label. + public float Label { get; set; } + // Predicted score from the trainer. + public float Score { get; set; } + } private class AdultData { [LoadColumn(0, 10), ColumnName("FeatureVector")] @@ -108,8 +145,7 @@ public void SimpleEndToEndOnnxConversionTest() private class BreastCancerFeatureVector { [LoadColumn(1, 9), VectorType(9)] - public float[] Features; - } + public float[] Features; } private class BreastCancerCatFeatureExample { @@ -187,7 +223,160 @@ public void KmeansOnnxConversionTest() Done(); } - private class DataPoint + [Fact] + public void WordEmbeddingEstimatorOnnxConversionTest() //can't find the class - maybe + { + // Step 1: Create and train a ML.NET pipeline. + var mlContext = new MLContext(seed: 1); + string dataPath = GetDataPath(TestDatasets.Sentiment.trainFilename); + var data = new TextLoader(ML, + new TextLoader.Options() + { + Separator = "\t", + HasHeader = true, + Columns = new[] + { + new TextLoader.Column("Label", DataKind.Boolean, 0), + new TextLoader.Column("SentimentText", DataKind.String, 1) + } + }).Load(GetDataPath(dataPath)); + + IEstimator[] estimators = { }; + var textPipeline = mlContext.Transforms.Text.NormalizeText("SentimentText") + .Append(mlContext.Transforms.Text.TokenizeIntoWords("Tokens", + "SentimentText")) + .Append(mlContext.Transforms.Text.ApplyWordEmbedding("Features", + "Tokens", WordEmbeddingEstimator.PretrainedModelKind + .SentimentSpecificWordEmbedding)); + var model = textPipeline.Fit(data); + var transformedData = model.Transform(data); + + var onnxModel = mlContext.Model.ConvertToOnnxProtobuf(model, data); + // Compare results produced by ML.NET and ONNX's runtime. + if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows) && Environment.Is64BitProcess) + { + var onnxFileName = "WordEmbeddingEstimator.onnx"; + var onnxModelPath = GetOutputPath(onnxFileName); + SaveOnnxModel(onnxModel, onnxModelPath, null); + + // Evaluate the saved ONNX model using the data used to train the ML.NET pipeline. + string[] inputNames = onnxModel.Graph.Input.Select(valueInfoProto => valueInfoProto.Name).ToArray(); + string[] outputNames = onnxModel.Graph.Output.Select(valueInfoProto => valueInfoProto.Name).ToArray(); + var onnxEstimator = mlContext.Transforms.ApplyOnnxModel(outputNames, inputNames, onnxModelPath); + var onnxTransformer = onnxEstimator.Fit(data); + var onnxResult = onnxTransformer.Transform(data); + CompareSelectedR4VectorColumns("Score", "Score0", transformedData, onnxResult, 3); + } + Done(); + } + + [Fact] + // Conversion tests for regression + public void regressionOnnxConversionTest() + { + /* + var mlContext = new MLContext(seed: 1); + string dataPath = GetDataPath(TestDatasets.generatedRegressionDataset.trainFilename); + + // Now read the file (remember though, readers are lazy, so the actual reading will happen when the data is accessed). + var dataView = mlContext.Data.LoadFromTextFile(dataPath, + separatorChar: ';', + hasHeader: true); + IEstimator[] estimators = { + //mlContext.Regression.Trainers.Ols(new OlsTrainer.Options() { + // LabelColumnName = "Target", + // FeatureColumnName = "FeatureVector", + //}), + //mlContext.Regression.Trainers.OnlineGradientDescent(new OnlineGradientDescentTrainer.Options(){ + // LabelColumnName = "Target", + // FeatureColumnName = "FeatureVector", + //}), + //mlContext.Transforms.DetectAnomalyBySrCnn("Target","FeatureVector"), // needs separate data + mlContext.Regression.Trainers.FastForest("Target", "FeatureVector"), + //mlContext.Regression.Trainers.FastTree("Target", "FeatureVector"), + //mlContext.Regression.Trainers.FastTreeTweedie("Target", "FeatureVector"), + //mlContext.Regression.Trainers.LightGbm("Target","FeatureVector"), + //mlContext.Regression.Trainers.LbfgsPoissonRegression("Target", "FeatureVector"), + }; + */ + // Create a new context for ML.NET operations. It can be used for + // exception tracking and logging, as a catalog of available operations + // and as the source of randomness. Setting the seed to a fixed number + // in this example to make outputs deterministic. + var mlContext = new MLContext(seed: 0); + + // Create a list of training data points. + var dataPoints = GenerateRandomDataPoints(1000); + + // Convert the list of data points to an IDataView object, which is + // consumable by ML.NET API. + var trainingData = mlContext.Data.LoadFromEnumerable(dataPoints); + + // Define the trainer. + var pipeline = mlContext.Regression.Trainers.FastTreeTweedie( + labelColumnName: nameof(DataPoint2.Label), + featureColumnName: nameof(DataPoint2.Features)); + + // Train the model. + var model = pipeline.Fit(trainingData); + + // Create testing data. Use different random seed to make it different + // from training data. + var data = mlContext.Data.LoadFromEnumerable( + GenerateRandomDataPoints(5, seed: 123)); + + // Run the model on test data set. + var transformedTestData = model.Transform(data); + // Convert IDataView object to a list. + var onnxModel = mlContext.Model.ConvertToOnnxProtobuf(model, data); + // Convert IDataView object to a list. + var predictions = mlContext.Data.CreateEnumerable( + transformedTestData, reuseRowObject: false).ToList(); + foreach (var p in predictions) + System.Diagnostics.Debug.WriteLine($"Label: {p.Label:F3}, Prediction: {p.Score:F3}"); + // Compare results produced by ML.NET and ONNX's runtime. + if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows) && Environment.Is64BitProcess) + { + var onnxFileName = "test.onnx"; + var onnxModelPath = GetOutputPath(onnxFileName); + SaveOnnxModel(onnxModel, onnxModelPath, null); + + // Evaluate the saved ONNX model using the data used to train the ML.NET pipeline. + string[] inputNames = onnxModel.Graph.Input.Select(valueInfoProto => valueInfoProto.Name).ToArray(); + string[] outputNames = onnxModel.Graph.Output.Select(valueInfoProto => valueInfoProto.Name).ToArray(); + var onnxEstimator = mlContext.Transforms.ApplyOnnxModel(outputNames, inputNames, onnxModelPath); + var onnxTransformer = onnxEstimator.Fit(data); + var onnxResult = onnxTransformer.Transform(data); + CompareSelectedR4ScalarColumns("Label", "Score0", data, onnxResult, 3); + } + Done(); + /*var initialPipeline = mlContext.Transforms.NormalizeMinMax("FeatureVector"); + foreach (var estimator in estimators) + { + //var pipeline = initialPipeline.Append(estimator); + var pipeline = estimator; + + var model = pipeline.Fit(dataView); + var transformedData = model.Transform(dataView); + var onnxModel = mlContext.Model.ConvertToOnnxProtobuf(model, dataView); + var onnxFileName = $"{estimator.ToString()}.onnx"; + var onnxModelPath = GetOutputPath(onnxFileName); + SaveOnnxModel(onnxModel, onnxModelPath, null); + // Compare model scores produced by ML.NET and ONNX's runtime. + if (IsOnnxRuntimeSupported()) + { + // Evaluate the saved ONNX model using the data used to train the ML.NET pipeline. + string[] inputNames = onnxModel.Graph.Input.Select(valueInfoProto => valueInfoProto.Name).ToArray(); + string[] outputNames = onnxModel.Graph.Output.Select(valueInfoProto => valueInfoProto.Name).ToArray(); + var onnxEstimator = mlContext.Transforms.ApplyOnnxModel(outputNames, inputNames, onnxModelPath); + var onnxTransformer = onnxEstimator.Fit(dataView); + var onnxResult = onnxTransformer.Transform(dataView); //switched to 2 vause + CompareSelectedR4ScalarColumns(transformedData.Schema[2].Name, outputNames[2], transformedData, onnxResult, 0); // compare score results + } + } */ + //Done(); + } + private class DataPoint { [VectorType(3)] public float[] Features { get; set; } @@ -380,8 +569,7 @@ public void LogisticRegressionOnnxConversionTest() var trainDataPath = GetDataPath(TestDatasets.generatedRegressionDataset.trainFilename); var mlContext = new MLContext(seed: 1); var data = mlContext.Data.LoadFromTextFile(trainDataPath, - separatorChar: ';' -, + separatorChar: ';', hasHeader: true); var cachedTrainData = mlContext.Data.Cache(data); var dynamicPipeline = @@ -658,15 +846,21 @@ public void WordEmbeddingsTest() var model = pipeline.Fit(data); var transformedData = model.Transform(data); - var subDir = Path.Combine("..", "..", "BaselineOutput", "Common", "Onnx", "Transforms", "Sentiment"); - var onnxTextName = "SmallWordEmbed.txt"; - var onnxFileName = "SmallWordEmbed.onnx"; - var onnxTextPath = GetOutputPath(subDir, onnxTextName); - var onnxFilePath = GetOutputPath(subDir, onnxFileName); var onnxModel = mlContext.Model.ConvertToOnnxProtobuf(model, data); - SaveOnnxModel(onnxModel, onnxFilePath, onnxTextPath); + if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows) && Environment.Is64BitProcess) + { + var onnxFileName = "WordEmbeddingEstimator.onnx"; + var onnxModelPath = GetOutputPath(onnxFileName); + SaveOnnxModel(onnxModel, onnxModelPath, null); - CheckEquality(subDir, onnxTextName, parseOption: NumberParseOption.UseSingle); + // Evaluate the saved ONNX model using the data used to train the ML.NET pipeline. + string[] inputNames = onnxModel.Graph.Input.Select(valueInfoProto => valueInfoProto.Name).ToArray(); + string[] outputNames = onnxModel.Graph.Output.Select(valueInfoProto => valueInfoProto.Name).ToArray(); + var onnxEstimator = mlContext.Transforms.ApplyOnnxModel(outputNames, inputNames, onnxModelPath); + var onnxTransformer = onnxEstimator.Fit(data); + var onnxResult = onnxTransformer.Transform(data); + CompareSelectedR4VectorColumns("Embed", "Embed0", transformedData, onnxResult); + } Done(); } @@ -984,11 +1178,44 @@ private void CompareSelectedR4ScalarColumns(string leftColumnName, string rightC // Scalar such as R4 (float) is converted to [1, 1]-tensor in ONNX format for consitency of making batch prediction. Assert.Equal(1, actual.Length); - Assert.Equal(expected, actual.GetItemOrDefault(0), precision); + //Assert.Equal(expected, actual.GetItemOrDefault(0), precision); + //Output.WriteLine(actual.GetItemOrDefault(0)); + System.Diagnostics.Debug.WriteLine("Actual: " + actual.GetItemOrDefault(0)); + System.Diagnostics.Debug.WriteLine("Expected: " + expected); } } } + private void CompareSelectedScalarColumns(string leftColumnName, string rightColumnName, IDataView left, IDataView right) + { + var leftColumn = left.Schema[leftColumnName]; + var rightColumn = right.Schema[rightColumnName]; + + using (var expectedCursor = left.GetRowCursor(leftColumn)) + using (var actualCursor = right.GetRowCursor(rightColumn)) + { + T expected = default; + VBuffer actual = default; + var expectedGetter = expectedCursor.GetGetter(leftColumn); + var actualGetter = actualCursor.GetGetter>(rightColumn); + while (expectedCursor.MoveNext() && actualCursor.MoveNext()) + { + expectedGetter(ref expected); + actualGetter(ref actual); + var actualVal = actual.GetItemOrDefault(0); + + Assert.Equal(1, actual.Length); + + if (typeof(T) == typeof(ReadOnlyMemory)) + Assert.Equal(expected.ToString(), actualVal.ToString()); + else + Assert.Equal(expected, actualVal); + } + } + } + + + private void SaveOnnxModel(ModelProto model, string binaryFormatPath, string textFormatPath) { DeleteOutputPath(binaryFormatPath); // Clean if such a file exists. From 8100364471a41b79186b18d1f29fab1c735b4bb0 Mon Sep 17 00:00:00 2001 From: Keren Fuentes Date: Thu, 31 Oct 2019 15:26:43 -0700 Subject: [PATCH 2/3] Revert "draft regression test" This reverts commit 1ad45c995516b9d39fc05aca855ce2abe96c407b. --- test/Microsoft.ML.Tests/OnnxConversionTest.cs | 255 +----------------- 1 file changed, 14 insertions(+), 241 deletions(-) diff --git a/test/Microsoft.ML.Tests/OnnxConversionTest.cs b/test/Microsoft.ML.Tests/OnnxConversionTest.cs index a2fd9091f57..0a79681b0b7 100644 --- a/test/Microsoft.ML.Tests/OnnxConversionTest.cs +++ b/test/Microsoft.ML.Tests/OnnxConversionTest.cs @@ -21,7 +21,6 @@ using Microsoft.ML.Trainers; using Microsoft.ML.Transforms; using Microsoft.ML.Transforms.Onnx; -using Microsoft.ML.Transforms.Text; using Newtonsoft.Json; using Xunit; using Xunit.Abstractions; @@ -29,44 +28,8 @@ namespace Microsoft.ML.Tests { - -public class OnnxConversionTest : BaseTestBaseline + public class OnnxConversionTest : BaseTestBaseline { - - private static IEnumerable GenerateRandomDataPoints(int count, - int seed = 0) - { - var random = new Random(seed); - for (int i = 0; i < count; i++) - { - float label = (float)random.NextDouble(); - yield return new DataPoint2 - { - Label = label, - // Create random features that are correlated with the label. - Features = Enumerable.Repeat(label, 50).Select( - x => x + (float)random.NextDouble()).ToArray() - }; - } - } - - // Example with label and 50 feature values. A data set is a collection of - // such examples. - private class DataPoint2 - { - public float Label { get; set; } - [VectorType(50)] - public float[] Features { get; set; } - } - - // Class used to capture predictions. - private class Prediction - { - // Original label. - public float Label { get; set; } - // Predicted score from the trainer. - public float Score { get; set; } - } private class AdultData { [LoadColumn(0, 10), ColumnName("FeatureVector")] @@ -145,7 +108,8 @@ public void SimpleEndToEndOnnxConversionTest() private class BreastCancerFeatureVector { [LoadColumn(1, 9), VectorType(9)] - public float[] Features; } + public float[] Features; + } private class BreastCancerCatFeatureExample { @@ -223,160 +187,7 @@ public void KmeansOnnxConversionTest() Done(); } - [Fact] - public void WordEmbeddingEstimatorOnnxConversionTest() //can't find the class - maybe - { - // Step 1: Create and train a ML.NET pipeline. - var mlContext = new MLContext(seed: 1); - string dataPath = GetDataPath(TestDatasets.Sentiment.trainFilename); - var data = new TextLoader(ML, - new TextLoader.Options() - { - Separator = "\t", - HasHeader = true, - Columns = new[] - { - new TextLoader.Column("Label", DataKind.Boolean, 0), - new TextLoader.Column("SentimentText", DataKind.String, 1) - } - }).Load(GetDataPath(dataPath)); - - IEstimator[] estimators = { }; - var textPipeline = mlContext.Transforms.Text.NormalizeText("SentimentText") - .Append(mlContext.Transforms.Text.TokenizeIntoWords("Tokens", - "SentimentText")) - .Append(mlContext.Transforms.Text.ApplyWordEmbedding("Features", - "Tokens", WordEmbeddingEstimator.PretrainedModelKind - .SentimentSpecificWordEmbedding)); - var model = textPipeline.Fit(data); - var transformedData = model.Transform(data); - - var onnxModel = mlContext.Model.ConvertToOnnxProtobuf(model, data); - // Compare results produced by ML.NET and ONNX's runtime. - if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows) && Environment.Is64BitProcess) - { - var onnxFileName = "WordEmbeddingEstimator.onnx"; - var onnxModelPath = GetOutputPath(onnxFileName); - SaveOnnxModel(onnxModel, onnxModelPath, null); - - // Evaluate the saved ONNX model using the data used to train the ML.NET pipeline. - string[] inputNames = onnxModel.Graph.Input.Select(valueInfoProto => valueInfoProto.Name).ToArray(); - string[] outputNames = onnxModel.Graph.Output.Select(valueInfoProto => valueInfoProto.Name).ToArray(); - var onnxEstimator = mlContext.Transforms.ApplyOnnxModel(outputNames, inputNames, onnxModelPath); - var onnxTransformer = onnxEstimator.Fit(data); - var onnxResult = onnxTransformer.Transform(data); - CompareSelectedR4VectorColumns("Score", "Score0", transformedData, onnxResult, 3); - } - Done(); - } - - [Fact] - // Conversion tests for regression - public void regressionOnnxConversionTest() - { - /* - var mlContext = new MLContext(seed: 1); - string dataPath = GetDataPath(TestDatasets.generatedRegressionDataset.trainFilename); - - // Now read the file (remember though, readers are lazy, so the actual reading will happen when the data is accessed). - var dataView = mlContext.Data.LoadFromTextFile(dataPath, - separatorChar: ';', - hasHeader: true); - IEstimator[] estimators = { - //mlContext.Regression.Trainers.Ols(new OlsTrainer.Options() { - // LabelColumnName = "Target", - // FeatureColumnName = "FeatureVector", - //}), - //mlContext.Regression.Trainers.OnlineGradientDescent(new OnlineGradientDescentTrainer.Options(){ - // LabelColumnName = "Target", - // FeatureColumnName = "FeatureVector", - //}), - //mlContext.Transforms.DetectAnomalyBySrCnn("Target","FeatureVector"), // needs separate data - mlContext.Regression.Trainers.FastForest("Target", "FeatureVector"), - //mlContext.Regression.Trainers.FastTree("Target", "FeatureVector"), - //mlContext.Regression.Trainers.FastTreeTweedie("Target", "FeatureVector"), - //mlContext.Regression.Trainers.LightGbm("Target","FeatureVector"), - //mlContext.Regression.Trainers.LbfgsPoissonRegression("Target", "FeatureVector"), - }; - */ - // Create a new context for ML.NET operations. It can be used for - // exception tracking and logging, as a catalog of available operations - // and as the source of randomness. Setting the seed to a fixed number - // in this example to make outputs deterministic. - var mlContext = new MLContext(seed: 0); - - // Create a list of training data points. - var dataPoints = GenerateRandomDataPoints(1000); - - // Convert the list of data points to an IDataView object, which is - // consumable by ML.NET API. - var trainingData = mlContext.Data.LoadFromEnumerable(dataPoints); - - // Define the trainer. - var pipeline = mlContext.Regression.Trainers.FastTreeTweedie( - labelColumnName: nameof(DataPoint2.Label), - featureColumnName: nameof(DataPoint2.Features)); - - // Train the model. - var model = pipeline.Fit(trainingData); - - // Create testing data. Use different random seed to make it different - // from training data. - var data = mlContext.Data.LoadFromEnumerable( - GenerateRandomDataPoints(5, seed: 123)); - - // Run the model on test data set. - var transformedTestData = model.Transform(data); - // Convert IDataView object to a list. - var onnxModel = mlContext.Model.ConvertToOnnxProtobuf(model, data); - // Convert IDataView object to a list. - var predictions = mlContext.Data.CreateEnumerable( - transformedTestData, reuseRowObject: false).ToList(); - foreach (var p in predictions) - System.Diagnostics.Debug.WriteLine($"Label: {p.Label:F3}, Prediction: {p.Score:F3}"); - // Compare results produced by ML.NET and ONNX's runtime. - if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows) && Environment.Is64BitProcess) - { - var onnxFileName = "test.onnx"; - var onnxModelPath = GetOutputPath(onnxFileName); - SaveOnnxModel(onnxModel, onnxModelPath, null); - - // Evaluate the saved ONNX model using the data used to train the ML.NET pipeline. - string[] inputNames = onnxModel.Graph.Input.Select(valueInfoProto => valueInfoProto.Name).ToArray(); - string[] outputNames = onnxModel.Graph.Output.Select(valueInfoProto => valueInfoProto.Name).ToArray(); - var onnxEstimator = mlContext.Transforms.ApplyOnnxModel(outputNames, inputNames, onnxModelPath); - var onnxTransformer = onnxEstimator.Fit(data); - var onnxResult = onnxTransformer.Transform(data); - CompareSelectedR4ScalarColumns("Label", "Score0", data, onnxResult, 3); - } - Done(); - /*var initialPipeline = mlContext.Transforms.NormalizeMinMax("FeatureVector"); - foreach (var estimator in estimators) - { - //var pipeline = initialPipeline.Append(estimator); - var pipeline = estimator; - - var model = pipeline.Fit(dataView); - var transformedData = model.Transform(dataView); - var onnxModel = mlContext.Model.ConvertToOnnxProtobuf(model, dataView); - var onnxFileName = $"{estimator.ToString()}.onnx"; - var onnxModelPath = GetOutputPath(onnxFileName); - SaveOnnxModel(onnxModel, onnxModelPath, null); - // Compare model scores produced by ML.NET and ONNX's runtime. - if (IsOnnxRuntimeSupported()) - { - // Evaluate the saved ONNX model using the data used to train the ML.NET pipeline. - string[] inputNames = onnxModel.Graph.Input.Select(valueInfoProto => valueInfoProto.Name).ToArray(); - string[] outputNames = onnxModel.Graph.Output.Select(valueInfoProto => valueInfoProto.Name).ToArray(); - var onnxEstimator = mlContext.Transforms.ApplyOnnxModel(outputNames, inputNames, onnxModelPath); - var onnxTransformer = onnxEstimator.Fit(dataView); - var onnxResult = onnxTransformer.Transform(dataView); //switched to 2 vause - CompareSelectedR4ScalarColumns(transformedData.Schema[2].Name, outputNames[2], transformedData, onnxResult, 0); // compare score results - } - } */ - //Done(); - } - private class DataPoint + private class DataPoint { [VectorType(3)] public float[] Features { get; set; } @@ -569,7 +380,8 @@ public void LogisticRegressionOnnxConversionTest() var trainDataPath = GetDataPath(TestDatasets.generatedRegressionDataset.trainFilename); var mlContext = new MLContext(seed: 1); var data = mlContext.Data.LoadFromTextFile(trainDataPath, - separatorChar: ';', + separatorChar: ';' +, hasHeader: true); var cachedTrainData = mlContext.Data.Cache(data); var dynamicPipeline = @@ -846,21 +658,15 @@ public void WordEmbeddingsTest() var model = pipeline.Fit(data); var transformedData = model.Transform(data); + var subDir = Path.Combine("..", "..", "BaselineOutput", "Common", "Onnx", "Transforms", "Sentiment"); + var onnxTextName = "SmallWordEmbed.txt"; + var onnxFileName = "SmallWordEmbed.onnx"; + var onnxTextPath = GetOutputPath(subDir, onnxTextName); + var onnxFilePath = GetOutputPath(subDir, onnxFileName); var onnxModel = mlContext.Model.ConvertToOnnxProtobuf(model, data); - if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows) && Environment.Is64BitProcess) - { - var onnxFileName = "WordEmbeddingEstimator.onnx"; - var onnxModelPath = GetOutputPath(onnxFileName); - SaveOnnxModel(onnxModel, onnxModelPath, null); + SaveOnnxModel(onnxModel, onnxFilePath, onnxTextPath); - // Evaluate the saved ONNX model using the data used to train the ML.NET pipeline. - string[] inputNames = onnxModel.Graph.Input.Select(valueInfoProto => valueInfoProto.Name).ToArray(); - string[] outputNames = onnxModel.Graph.Output.Select(valueInfoProto => valueInfoProto.Name).ToArray(); - var onnxEstimator = mlContext.Transforms.ApplyOnnxModel(outputNames, inputNames, onnxModelPath); - var onnxTransformer = onnxEstimator.Fit(data); - var onnxResult = onnxTransformer.Transform(data); - CompareSelectedR4VectorColumns("Embed", "Embed0", transformedData, onnxResult); - } + CheckEquality(subDir, onnxTextName, parseOption: NumberParseOption.UseSingle); Done(); } @@ -1178,44 +984,11 @@ private void CompareSelectedR4ScalarColumns(string leftColumnName, string rightC // Scalar such as R4 (float) is converted to [1, 1]-tensor in ONNX format for consitency of making batch prediction. Assert.Equal(1, actual.Length); - //Assert.Equal(expected, actual.GetItemOrDefault(0), precision); - //Output.WriteLine(actual.GetItemOrDefault(0)); - System.Diagnostics.Debug.WriteLine("Actual: " + actual.GetItemOrDefault(0)); - System.Diagnostics.Debug.WriteLine("Expected: " + expected); + Assert.Equal(expected, actual.GetItemOrDefault(0), precision); } } } - private void CompareSelectedScalarColumns(string leftColumnName, string rightColumnName, IDataView left, IDataView right) - { - var leftColumn = left.Schema[leftColumnName]; - var rightColumn = right.Schema[rightColumnName]; - - using (var expectedCursor = left.GetRowCursor(leftColumn)) - using (var actualCursor = right.GetRowCursor(rightColumn)) - { - T expected = default; - VBuffer actual = default; - var expectedGetter = expectedCursor.GetGetter(leftColumn); - var actualGetter = actualCursor.GetGetter>(rightColumn); - while (expectedCursor.MoveNext() && actualCursor.MoveNext()) - { - expectedGetter(ref expected); - actualGetter(ref actual); - var actualVal = actual.GetItemOrDefault(0); - - Assert.Equal(1, actual.Length); - - if (typeof(T) == typeof(ReadOnlyMemory)) - Assert.Equal(expected.ToString(), actualVal.ToString()); - else - Assert.Equal(expected, actualVal); - } - } - } - - - private void SaveOnnxModel(ModelProto model, string binaryFormatPath, string textFormatPath) { DeleteOutputPath(binaryFormatPath); // Clean if such a file exists. From c96d6904333bd1f10f9e895713225f69879b9e54 Mon Sep 17 00:00:00 2001 From: Keren Fuentes Date: Fri, 8 Nov 2019 15:53:11 -0800 Subject: [PATCH 3/3] fix for binary classification trainers export to onnx --- .../Scorers/BinaryClassifierScorer.cs | 28 ++++-- .../Scorers/SchemaBindablePredictorWrapper.cs | 2 +- test/Microsoft.ML.Tests/OnnxConversionTest.cs | 89 ++++++++++++++++++- 3 files changed, 111 insertions(+), 8 deletions(-) diff --git a/src/Microsoft.ML.Data/Scorers/BinaryClassifierScorer.cs b/src/Microsoft.ML.Data/Scorers/BinaryClassifierScorer.cs index 9ef81f90474..0d866066275 100644 --- a/src/Microsoft.ML.Data/Scorers/BinaryClassifierScorer.cs +++ b/src/Microsoft.ML.Data/Scorers/BinaryClassifierScorer.cs @@ -197,14 +197,32 @@ private protected override void SaveAsOnnxCore(OnnxContext ctx) for (int iinfo = 0; iinfo < Bindings.InfoCount; ++iinfo) outColumnNames[iinfo] = Bindings.GetColumnName(Bindings.MapIinfoToCol(iinfo)); - //Check if "Probability" column was generated by the base class, only then - //label can be predicted. + /* If the probability column was generated, then the classification threshold is set to 0.5. Otherwise, + the predicted label is based on the sign of the score. + REVIEW: Binarizer should always have at least two output columns? + */ + string opType = "Binarizer"; + var binarizerOutput = ctx.AddIntermediateVariable(null, "BinarizerOutput", true); + if (Bindings.InfoCount >= 3 && ctx.ContainsColumn(outColumnNames[2])) { - string opType = "Binarizer"; - var node = ctx.CreateNode(opType, new[] { ctx.GetVariableName(outColumnNames[2]) }, - new[] { ctx.GetVariableName(outColumnNames[0]) }, ctx.GetNodeName(opType)); + var node = ctx.CreateNode(opType, ctx.GetVariableName(outColumnNames[2]), binarizerOutput, ctx.GetNodeName(opType)); node.AddAttribute("threshold", 0.5); + + opType = "Cast"; + node = ctx.CreateNode(opType, binarizerOutput, ctx.GetVariableName(outColumnNames[0]), ctx.GetNodeName(opType), ""); + var t = InternalDataKindExtensions.ToInternalDataKind(DataKind.Boolean).ToType(); + node.AddAttribute("to", t); + } + else if (Bindings.InfoCount == 2) + { + var node = ctx.CreateNode(opType, ctx.GetVariableName(outColumnNames[1]), binarizerOutput, ctx.GetNodeName(opType)); + node.AddAttribute("threshold", 0.0); + + opType = "Cast"; + node = ctx.CreateNode(opType, binarizerOutput, ctx.GetVariableName(outColumnNames[0]), ctx.GetNodeName(opType), ""); + var t = InternalDataKindExtensions.ToInternalDataKind(DataKind.Boolean).ToType(); + node.AddAttribute("to", t); } } diff --git a/src/Microsoft.ML.Data/Scorers/SchemaBindablePredictorWrapper.cs b/src/Microsoft.ML.Data/Scorers/SchemaBindablePredictorWrapper.cs index b7f6c4da871..1367249c480 100644 --- a/src/Microsoft.ML.Data/Scorers/SchemaBindablePredictorWrapper.cs +++ b/src/Microsoft.ML.Data/Scorers/SchemaBindablePredictorWrapper.cs @@ -320,7 +320,7 @@ private protected override bool SaveAsOnnxCore(OnnxContext ctx, RoleMappedSchema if (!ctx.ContainsColumn(featName)) return false; Contracts.Assert(ctx.ContainsColumn(featName)); - return mapper.SaveAsOnnx(ctx, outputNames, ctx.GetVariableName(featName)); + return mapper.SaveAsOnnx(ctx, new[] { outputNames[1] }, ctx.GetVariableName(featName)); } private protected override ISchemaBoundMapper BindCore(IChannel ch, RoleMappedSchema schema) => diff --git a/test/Microsoft.ML.Tests/OnnxConversionTest.cs b/test/Microsoft.ML.Tests/OnnxConversionTest.cs index 0a79681b0b7..3cd22a8e025 100644 --- a/test/Microsoft.ML.Tests/OnnxConversionTest.cs +++ b/test/Microsoft.ML.Tests/OnnxConversionTest.cs @@ -131,6 +131,14 @@ private class BreastCancerMulticlassExample [LoadColumn(2, 9), VectorType(8)] public float[] Features; } + private class BreastCancerBinaryClassification + { + [LoadColumn(0)] + public bool Label; + + [LoadColumn(2, 9), VectorType(8)] + public float[] Features; + } [LessThanNetCore30OrNotNetCoreFact("netcoreapp3.0 output differs from Baseline. Tracked by https://github.com/dotnet/machinelearning/issues/2087")] public void KmeansOnnxConversionTest() @@ -187,6 +195,55 @@ public void KmeansOnnxConversionTest() Done(); } + [Fact] + public void binaryClassificationTrainersOnnxConversionTest() + { + var mlContext = new MLContext(seed: 1); + string dataPath = GetDataPath("breast-cancer.txt"); + // Now read the file (remember though, readers are lazy, so the actual reading will happen when the data is accessed). + var dataView = mlContext.Data.LoadFromTextFile(dataPath, separatorChar: '\t', hasHeader: true); + IEstimator[] estimators = { + mlContext.BinaryClassification.Trainers.SymbolicSgdLogisticRegression(), + mlContext.BinaryClassification.Trainers.SgdCalibrated(), + mlContext.BinaryClassification.Trainers.AveragedPerceptron(), + mlContext.BinaryClassification.Trainers.FastForest(), + mlContext.BinaryClassification.Trainers.LinearSvm(), + mlContext.BinaryClassification.Trainers.SdcaNonCalibrated(), + mlContext.BinaryClassification.Trainers.SgdNonCalibrated(), + mlContext.BinaryClassification.Trainers.FastTree(), + mlContext.BinaryClassification.Trainers.LbfgsLogisticRegression(), + mlContext.BinaryClassification.Trainers.LightGbm(), + mlContext.BinaryClassification.Trainers.SdcaLogisticRegression(), + mlContext.BinaryClassification.Trainers.SgdCalibrated(), + mlContext.BinaryClassification.Trainers.SymbolicSgdLogisticRegression(), + }; + var initialPipeline = mlContext.Transforms.ReplaceMissingValues("Features"). + Append(mlContext.Transforms.NormalizeMinMax("Features")); + foreach (var estimator in estimators) + { + var pipeline = initialPipeline.Append(estimator); + var model = pipeline.Fit(dataView); + var transformedData = model.Transform(dataView); + var onnxModel = mlContext.Model.ConvertToOnnxProtobuf(model, dataView); + // Compare model scores produced by ML.NET and ONNX's runtime. + if (IsOnnxRuntimeSupported()) + { + var onnxFileName = $"{estimator.ToString()}.onnx"; + var onnxModelPath = GetOutputPath(onnxFileName); + SaveOnnxModel(onnxModel, onnxModelPath, null); + // Evaluate the saved ONNX model using the data used to train the ML.NET pipeline. + string[] inputNames = onnxModel.Graph.Input.Select(valueInfoProto => valueInfoProto.Name).ToArray(); + string[] outputNames = onnxModel.Graph.Output.Select(valueInfoProto => valueInfoProto.Name).ToArray(); + var onnxEstimator = mlContext.Transforms.ApplyOnnxModel(outputNames, inputNames, onnxModelPath); + var onnxTransformer = onnxEstimator.Fit(dataView); + var onnxResult = onnxTransformer.Transform(dataView); + CompareSelectedR4ScalarColumns(transformedData.Schema[5].Name, outputNames[3], transformedData, onnxResult, 3); + CompareSelectedScalarColumns(transformedData.Schema[4].Name, outputNames[2], transformedData, onnxResult); + } + + } + Done(); + } private class DataPoint { [VectorType(3)] @@ -853,7 +910,8 @@ private void CreateDummyExamplesToMakeComplierHappy() var dummyExample = new BreastCancerFeatureVector() { Features = null }; var dummyExample1 = new BreastCancerCatFeatureExample() { Label = false, F1 = 0, F2 = "Amy" }; var dummyExample2 = new BreastCancerMulticlassExample() { Label = "Amy", Features = null }; - var dummyExample3 = new SmallSentimentExample() { Tokens = null }; + var dummyExample3 = new BreastCancerBinaryClassification() { Label = false, Features = null }; + var dummyExample4 = new SmallSentimentExample() { Tokens = null }; } private void CompareResults(string leftColumnName, string rightColumnName, IDataView left, IDataView right) @@ -984,7 +1042,34 @@ private void CompareSelectedR4ScalarColumns(string leftColumnName, string rightC // Scalar such as R4 (float) is converted to [1, 1]-tensor in ONNX format for consitency of making batch prediction. Assert.Equal(1, actual.Length); - Assert.Equal(expected, actual.GetItemOrDefault(0), precision); + CompareNumbersWithTolerance(expected, actual.GetItemOrDefault(0), null, precision); + } + } + } + private void CompareSelectedScalarColumns(string leftColumnName, string rightColumnName, IDataView left, IDataView right) + { + var leftColumn = left.Schema[leftColumnName]; + var rightColumn = right.Schema[rightColumnName]; + + using (var expectedCursor = left.GetRowCursor(leftColumn)) + using (var actualCursor = right.GetRowCursor(rightColumn)) + { + T expected = default; + VBuffer actual = default; + var expectedGetter = expectedCursor.GetGetter(leftColumn); + var actualGetter = actualCursor.GetGetter>(rightColumn); + while (expectedCursor.MoveNext() && actualCursor.MoveNext()) + { + expectedGetter(ref expected); + actualGetter(ref actual); + var actualVal = actual.GetItemOrDefault(0); + + Assert.Equal(1, actual.Length); + + if (typeof(T) == typeof(ReadOnlyMemory)) + Assert.Equal(expected.ToString(), actualVal.ToString()); + else + Assert.Equal(expected, actualVal); } } }