Skip to content

Commit a8915f4

Browse files
authored
Created sample for 'LatentDirichletAllocation' API. (#3191)
1 parent b8a70ac commit a8915f4

File tree

3 files changed

+75
-62
lines changed

3 files changed

+75
-62
lines changed

docs/samples/Microsoft.ML.Samples/Dynamic/LdaTransform.cs

Lines changed: 0 additions & 61 deletions
This file was deleted.
Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
using System;
2+
using System.Collections.Generic;
3+
using Microsoft.ML.Data;
4+
5+
namespace Microsoft.ML.Samples.Dynamic
6+
{
7+
/// <summary>
/// Sample demonstrating how to featurize text with the LatentDirichletAllocation API.
/// </summary>
public static class LatentDirichletAllocation
{
    /// <summary>
    /// Builds a small in-memory text dataset, fits a pipeline that ends in
    /// LatentDirichletAllocation with three topics, and prints the resulting
    /// feature vectors for the first two documents.
    /// </summary>
    public static void Example()
    {
        // The ML context is the starting point for ML.NET operations. It can be used for
        // exception tracking and logging, and it is also the source of randomness.
        var context = new MLContext();

        // A handful of short documents, held in memory as an IEnumerable.
        var samples = new List<TextData>
        {
            new TextData { Text = "ML.NET's LatentDirichletAllocation API computes topic models." },
            new TextData { Text = "ML.NET's LatentDirichletAllocation API is the best for topic models." },
            new TextData { Text = "I like to eat broccoli and bananas." },
            new TextData { Text = "I eat bananas for breakfast." },
            new TextData { Text = "This car is expensive compared to last week's price." },
            new TextData { Text = "This car was $X last week." },
        };

        // Wrap the in-memory documents in an IDataView so they can be used for training.
        var trainingData = context.Data.LoadFromEnumerable(samples);

        // Text featurization pipeline built around the LatentDirichletAllocation API.
        // To make the LDA features more accurate, the text is normalized and stop words are
        // removed first, so that LatentDirichletAllocation receives tokens (the individual
        // words, lower cased, with common words removed).
        var text = context.Transforms.Text;
        var estimator = text.NormalizeText("NormalizedText", "Text")
            .Append(text.TokenizeIntoWords("Tokens", "NormalizedText"))
            .Append(text.RemoveDefaultStopWords("Tokens"))
            .Append(context.Transforms.Conversion.MapValueToKey("Tokens"))
            .Append(text.ProduceNgrams("Tokens"))
            .Append(text.LatentDirichletAllocation("Features", "Tokens", numberOfTopics: 3));

        // Train the pipeline on the sample documents.
        var model = estimator.Fit(trainingData);

        // The prediction engine maps a single TextData to its LDA features.
        var engine = context.Model.CreatePredictionEngine<TextData, TransformedTextData>(model);

        // Featurize the first two documents and print their LDA features.
        const int documentsToPrint = 2;
        for (int index = 0; index < documentsToPrint; index++)
        {
            PrintLdaFeatures(engine.Predict(samples[index]));
        }

        // Features obtained post-transformation.
        // The pipeline requested numberOfTopics: 3, hence each prediction has been
        // featurized as a vector of floats with length 3.

        //  Topic1  Topic2  Topic3
        //  0.6364  0.2727  0.0909
        //  0.5455  0.1818  0.2727
    }

    /// <summary>
    /// Writes every value of <see cref="TransformedTextData.Features"/> to the console on one line,
    /// formatted with four decimal places and followed by a space, then ends the line.
    /// </summary>
    /// <param name="prediction">The transformed row whose features are printed.</param>
    private static void PrintLdaFeatures(TransformedTextData prediction)
    {
        foreach (float topicWeight in prediction.Features)
        {
            Console.Write($"{topicWeight:F4} ");
        }

        Console.WriteLine();
    }

    /// <summary>
    /// Input row: a single piece of text.
    /// </summary>
    private class TextData
    {
        /// <summary>The raw text; read by the pipeline as the "Text" column.</summary>
        public string Text { get; set; }
    }

    /// <summary>
    /// Output row: the input text plus the features written by the pipeline.
    /// </summary>
    private class TransformedTextData : TextData
    {
        /// <summary>The LDA feature vector; populated from the "Features" column.</summary>
        public float[] Features { get; set; }
    }
}
74+
}

src/Microsoft.ML.Transforms/Text/TextCatalog.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -509,7 +509,7 @@ internal static NgramHashingEstimator ProduceHashedNgrams(this TransformsCatalog
509509
/// <example>
510510
/// <format type="text/markdown">
511511
/// <![CDATA[
512-
/// [!code-csharp[LatentDirichletAllocation](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/LdaTransform.cs)]
512+
/// [!code-csharp[LatentDirichletAllocation](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/LatentDirichletAllocation.cs)]
513513
/// ]]>
514514
/// </format>
515515
/// </example>

0 commit comments

Comments
 (0)