Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions src/Microsoft.ML.Transforms/Text/TextCatalog.cs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
// See the LICENSE file in the project root for more information.

using Microsoft.ML.Data;
using Microsoft.ML.Internal.Utilities;
using Microsoft.ML.Runtime;
using Microsoft.ML.Transforms.Text;

Expand Down Expand Up @@ -39,7 +40,7 @@ public static TextFeaturizingEstimator FeaturizeText(this TransformsCatalog.Text
outputColumnName, inputColumnName);

/// <summary>
/// Create a <see cref="TextFeaturizingEstimator"/>, which transforms a text column into featurized float array that represents normalized counts of n-grams and char-grams.
/// Create a <see cref="TextFeaturizingEstimator"/>, which transforms a text column into a featurized vector of <see cref="System.Single"/> that represents normalized counts of n-grams and char-grams.
/// </summary>
/// <remarks>This transform can operate over several columns.</remarks>
/// <param name="catalog">The text-related transform's catalog.</param>
Expand All @@ -62,7 +63,8 @@ public static TextFeaturizingEstimator FeaturizeText(this TransformsCatalog.Text
TextFeaturizingEstimator.Options options,
params string[] inputColumnNames)
=> new TextFeaturizingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(),
outputColumnName, inputColumnNames, options);
outputColumnName, Utils.Size(inputColumnNames) == 0 ? new[] { outputColumnName } : inputColumnNames,
options);

/// <summary>
/// Create a <see cref="TokenizingByCharactersEstimator"/>, which tokenizes by splitting text into sequences of characters
Expand Down
90 changes: 90 additions & 0 deletions test/Microsoft.ML.Tests/Transformers/TextFeaturizerTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,12 @@ private class TestClass
public float[] Features = null;
}

// Row type for the tests that call FeaturizeText without explicit input column names.
// Kept as plain public fields; the test data is loaded from instances of this class
// via LoadFromEnumerable.
private class TestClass2
{
// Raw input text. The tests also pass "Features" as the featurizer's output column name.
public string Features;
// Always null in the test data; the tests set OutputTokensColumnName to "OutputTokens".
public string[] OutputTokens;
}

[Fact]
public void TextFeaturizerWithPredefinedStopWordRemoverTest()
{
Expand Down Expand Up @@ -80,6 +86,90 @@ public void TextFeaturizerWithWordFeatureExtractorTest()
Assert.Equal(expected, prediction.Features);
}

// Verifies that FeaturizeText accepts an explicit null for its params input-column array
// and still writes the expected per-row term counts to the "Features" column.
[Fact]
public void TextFeaturizerWithWordFeatureExtractorWithNullInputNamesTest()
{
    var data = new[]
    {
        new TestClass2() { Features = "This is some text in english", OutputTokens = null },
        new TestClass2() { Features = "This is another example", OutputTokens = null }
    };
    var dataView = ML.Data.LoadFromEnumerable(data);

    // Unigram word bag only: no char-grams and no normalization, so each slot holds a raw count.
    var options = new TextFeaturizingEstimator.Options()
    {
        WordFeatureExtractor = new WordBagEstimator.Options() { NgramLength = 1 },
        CharFeatureExtractor = null,
        Norm = TextFeaturizingEstimator.NormFunction.None,
        OutputTokensColumnName = "OutputTokens"
    };

    // The trailing null binds to the params string[] parameter itself (a null array,
    // not an array containing one null element).
    var pipeline = ML.Transforms.Text.FeaturizeText("Features", options, null);
    dataView = pipeline.Fit(dataView).Transform(dataView);

    // One vector per input row. NOTE(review): slot order presumably follows the order in
    // which distinct tokens are first seen across the two sentences - confirm.
    var expected = new float[][] {
        new float[] { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 0.0f, 0.0f },
        new float[] { 1.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 1.0f }
    };
    var transformed = new float[expected.Length][];

    var rowCount = 0;
    using (var cursor = dataView.GetRowCursor(dataView.Schema))
    {
        // Create the getter once; it is invariant across rows, so there is no need to
        // rebuild the delegate on every MoveNext.
        var featureGetter = cursor.GetGetter<VBuffer<float>>(cursor.Schema["Features"]);
        VBuffer<float> features = default;

        while (cursor.MoveNext())
        {
            featureGetter(ref features);
            transformed[rowCount] = features.DenseValues().ToArray();
            rowCount++;
        }
    }

    // Every input row must have been visited exactly once.
    Assert.Equal(expected.Length, rowCount);
    Assert.Equal(expected[0], transformed[0]);
    Assert.Equal(expected[1], transformed[1]);
}

// Verifies that FeaturizeText can be called with no input column names at all
// (empty params array) and still writes the expected per-row term counts to "Features".
[Fact]
public void TextFeaturizerWithWordFeatureExtractorTestWithNoInputNames()
{
    var data = new[]
    {
        new TestClass2() { Features = "This is some text in english", OutputTokens = null },
        new TestClass2() { Features = "This is another example", OutputTokens = null }
    };
    var dataView = ML.Data.LoadFromEnumerable(data);

    // Unigram word bag only: no char-grams and no normalization, so each slot holds a raw count.
    var options = new TextFeaturizingEstimator.Options()
    {
        WordFeatureExtractor = new WordBagEstimator.Options() { NgramLength = 1 },
        CharFeatureExtractor = null,
        Norm = TextFeaturizingEstimator.NormFunction.None,
        OutputTokensColumnName = "OutputTokens"
    };

    // No input column names are supplied; only the output column name "Features" is given.
    var pipeline = ML.Transforms.Text.FeaturizeText("Features", options);
    dataView = pipeline.Fit(dataView).Transform(dataView);

    // One vector per input row. NOTE(review): slot order presumably follows the order in
    // which distinct tokens are first seen across the two sentences - confirm.
    var expected = new float[][] {
        new float[] { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 0.0f, 0.0f },
        new float[] { 1.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 1.0f }
    };
    var transformed = new float[expected.Length][];

    var rowCount = 0;
    using (var cursor = dataView.GetRowCursor(dataView.Schema))
    {
        // Create the getter once; it is invariant across rows, so there is no need to
        // rebuild the delegate on every MoveNext.
        var featureGetter = cursor.GetGetter<VBuffer<float>>(cursor.Schema["Features"]);
        VBuffer<float> features = default;

        while (cursor.MoveNext())
        {
            featureGetter(ref features);
            transformed[rowCount] = features.DenseValues().ToArray();
            rowCount++;
        }
    }

    // Every input row must have been visited exactly once.
    Assert.Equal(expected.Length, rowCount);
    Assert.Equal(expected[0], transformed[0]);
    Assert.Equal(expected[1], transformed[1]);
}

[Fact]
public void TextFeaturizerWithCharFeatureExtractorTest()
{
Expand Down