Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

TokenizingByCharacters export to Onnx #4805

Merged
merged 6 commits into from
Feb 7, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ namespace Microsoft.ML.Trainers
/// | Is normalization required? | Yes |
/// | Is caching required? | No |
/// | Required NuGet in addition to Microsoft.ML | None |
/// | Exportable to ONNX | No |
/// | Exportable to ONNX | Yes |
///
/// ### Training Algorithm Details
/// [Naive Bayes](https://en.wikipedia.org/wiki/Naive_Bayes_classifier)
Expand Down
2 changes: 1 addition & 1 deletion src/Microsoft.ML.Transforms/Text/TextNormalizing.cs
Original file line number Diff line number Diff line change
Expand Up @@ -442,7 +442,7 @@ private bool IsCombiningDiacritic(char ch)
/// | Does this estimator need to look at the data to train its parameters? | No |
/// | Input column data type | Scalar or Vector of [Text](xref:Microsoft.ML.Data.TextDataViewType)|
/// | Output column data type | Scalar or variable-sized Vector of [Text](xref:Microsoft.ML.Data.TextDataViewType)|
/// | Exportable to ONNX | No |
/// | Exportable to ONNX | Yes |
///
/// The resulting <xref:Microsoft.ML.Transforms.Text.TextNormalizingTransformer> creates a new column, named as specified
/// in the output column name parameters, and normalizes the textual input data by changing case, removing diacritical marks,
Expand Down
53 changes: 51 additions & 2 deletions src/Microsoft.ML.Transforms/Text/TokenizingByCharacters.cs
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
using Microsoft.ML.CommandLine;
using Microsoft.ML.Data;
using Microsoft.ML.Internal.Utilities;
using Microsoft.ML.Model.OnnxConverter;
using Microsoft.ML.Runtime;
using Microsoft.ML.Transforms.Text;

Expand Down Expand Up @@ -184,7 +185,7 @@ private static IRowMapper Create(IHostEnvironment env, ModelLoadContext ctx, Dat

private protected override IRowMapper MakeRowMapper(DataViewSchema schema) => new Mapper(this, schema);

private sealed class Mapper : OneToOneMapperBase
private sealed class Mapper : OneToOneMapperBase, ISaveAsOnnx
{
private readonly DataViewType _type;
private readonly TokenizingByCharactersTransformer _parent;
Expand All @@ -204,6 +205,54 @@ public Mapper(TokenizingByCharactersTransformer parent, DataViewSchema inputSche
_isSourceVector[i] = inputSchema[_parent.ColumnPairs[i].inputColumnName].Type is VectorDataViewType;
}

/// <summary>
/// This mapper is always exportable to ONNX; the context is not inspected.
/// </summary>
public bool CanSaveOnnx(OnnxContext ctx)
{
    return true;
}

/// <summary>
/// Exports every configured column pair to the ONNX graph. Input columns that
/// are not present in the ONNX context are skipped rather than treated as errors.
/// </summary>
public void SaveAsOnnx(OnnxContext ctx)
{
    Host.CheckValue(ctx, nameof(ctx));

    int columnCount = _isSourceVector.Length;
    for (int col = 0; col < columnCount; col++)
    {
        var pair = _parent.ColumnPairs[col];
        if (!ctx.ContainsColumn(pair.inputColumnName))
            continue;

        string src = ctx.GetVariableName(pair.inputColumnName);
        string dst = ctx.AddIntermediateVariable(_type, pair.outputColumnName, true);
        SaveAsOnnxCore(ctx, src, dst);
    }
}

/// <summary>
/// Emits the ONNX subgraph for one column pair as a four-node chain:
/// Tokenizer -> Squeeze -> LabelEncoder -> Cast.
/// </summary>
/// <param name="ctx">The ONNX export context nodes are added to.</param>
/// <param name="srcVariableName">ONNX variable holding the input text column.</param>
/// <param name="dstVariableName">ONNX variable receiving the UInt16 token output.</param>
private void SaveAsOnnxCore(OnnxContext ctx, string srcVariableName, string dstVariableName)
{
    // Tokenizer (com.microsoft domain): empty separators with mincharnum=1
    // splits the input text into individual characters; "mark" mirrors the
    // transform's use-marker-characters option.
    string opType = "Tokenizer";
    string tokenizerOutput = ctx.AddIntermediateVariable(null, "TokenizerOutput", true);
    var node = ctx.CreateNode(opType, srcVariableName, tokenizerOutput, ctx.GetNodeName(opType), "com.microsoft");
    node.AddAttribute("mark", _parent._useMarkerChars);
    node.AddAttribute("mincharnum", 1);
    node.AddAttribute("pad_value", "");
    node.AddAttribute("separators", new string[] { "" });

    // Squeeze drops axis 0 of the Tokenizer output before label encoding.
    opType = "Squeeze";
    var squeezeOutput = ctx.AddIntermediateVariable(null, "SqueezeOutput", true);
    node = ctx.CreateNode(opType, tokenizerOutput, squeezeOutput, ctx.GetNodeName(opType), "");
    node.AddAttribute("axes", new long[] { 0 });

    // LabelEncoder maps each single-character string to its UTF-16 code unit
    // value as an int64.
    opType = "LabelEncoder";
    var labelEncoderOutput = ctx.AddIntermediateVariable(null, "LabelEncoderOutput", true);
    node = ctx.CreateNode(opType, squeezeOutput, labelEncoderOutput, ctx.GetNodeName(opType));

    // NOTE(review): Range(0, 65535) enumerates code units 0..65534 only, so
    // U+FFFF (a Unicode noncharacter) is absent from the map — confirm the
    // omission is intentional before relying on it.
    IEnumerable<string> charStrings = Enumerable.Range(0, 65535).Select(x => ((char)x).ToString());
    IEnumerable<long> charValues = Enumerable.Range(0, 65535).Select(x => Convert.ToInt64(x));
    node.AddAttribute("keys_strings", charStrings);
    node.AddAttribute("values_int64s", charValues);

    // Cast narrows the encoder's int64 output to UInt16, matching the
    // transform's ushort output representation.
    opType = "Cast";
    var castNode = ctx.CreateNode(opType, labelEncoderOutput, dstVariableName, ctx.GetNodeName(opType), "");
    var t = InternalDataKindExtensions.ToInternalDataKind(DataKind.UInt16).ToType();
    castNode.AddAttribute("to", t);
}

protected override DataViewSchema.DetachedColumn[] GetOutputColumnsCore()
{
var result = new DataViewSchema.DetachedColumn[_parent.ColumnPairs.Length];
Expand Down Expand Up @@ -558,7 +607,7 @@ private ValueGetter<VBuffer<ushort>> MakeGetterVec(DataViewRow input, int iinfo)
/// | Does this estimator need to look at the data to train its parameters? | Yes |
/// | Input column data type | Scalar or Vector of [Text](xref:Microsoft.ML.Data.TextDataViewType) |
/// | Output column data type | Variable-sized vector of [key](xref:Microsoft.ML.Data.KeyDataViewType) type. |
/// | Exportable to ONNX | No |
/// | Exportable to ONNX | Yes |
///
/// The estimator tokenizes characters by splitting text into sequences of characters using a sliding window.
/// During training, the estimator builds a key-value pair dictionary with the encountered sequences of characters.
Expand Down
31 changes: 31 additions & 0 deletions test/Microsoft.ML.Tests/OnnxConversionTest.cs
Original file line number Diff line number Diff line change
Expand Up @@ -921,6 +921,37 @@ public void WordEmbeddingsTest()
Done();
}

[Theory]
[CombinatorialData]
public void TokenizingByCharactersOnnxConversionTest(bool useMarkerCharacters)
{
    // Fixed seed keeps the ML.NET side deterministic so its output can be
    // compared against the ONNX runtime's.
    var mlContext = new MLContext(seed: 1);
    var dataPath = GetDataPath("wikipedia-detox-250-line-test.tsv");
    var dataView = ML.Data.LoadFromTextFile(dataPath, new[] {
        new TextLoader.Column("label", DataKind.Boolean, 0),
        new TextLoader.Column("text", DataKind.String, 1)
    }, hasHeader: true);
    var pipeline = new TokenizingByCharactersEstimator(mlContext, useMarkerCharacters: useMarkerCharacters, columns: new[] { ("TokenizedText", "text") });
    var model = pipeline.Fit(dataView);
    var transformedData = model.Transform(dataView);
    var onnxModel = mlContext.Model.ConvertToOnnxProtobuf(model, dataView);

    // Compare model scores produced by ML.NET and ONNX's runtime.
    if (IsOnnxRuntimeSupported())
    {
        var onnxFileName = "TokenizingByCharacters.onnx";
        var onnxModelPath = GetOutputPath(onnxFileName);
        SaveOnnxModel(onnxModel, onnxModelPath, null);

        // Evaluate the saved ONNX model using the data used to train the ML.NET pipeline.
        string[] inputNames = onnxModel.Graph.Input.Select(valueInfoProto => valueInfoProto.Name).ToArray();
        string[] outputNames = onnxModel.Graph.Output.Select(valueInfoProto => valueInfoProto.Name).ToArray();
        var onnxEstimator = mlContext.Transforms.ApplyOnnxModel(outputNames, inputNames, onnxModelPath);
        var onnxTransformer = onnxEstimator.Fit(dataView);
        var onnxResult = onnxTransformer.Transform(dataView);

        // NOTE(review): index 2 assumes "TokenizedText" is the third schema
        // column and the third graph output — consider looking both up by
        // name to decouple the assertion from column ordering.
        CompareSelectedVectorColumns<UInt16>(transformedData.Schema[2].Name, outputNames[2], transformedData, onnxResult);
    }
    Done();
}

[Theory]
// These are the supported conversions
// ML.NET does not allow any conversions between signed and unsigned numeric types
Expand Down