|
| 1 | +using System; |
| 2 | +using System.Collections.Generic; |
| 3 | +using System.IO; |
| 4 | +using System.Text; |
| 5 | + |
namespace Microsoft.ML.Samples.Dynamic
{
    /// <summary>
    /// Sample that converts text into a 9-dimension feature vector using a custom
    /// word embedding model stored in a plain text file.
    /// </summary>
    public static class ApplyCustomWordEmbedding
    {
        /// <summary>
        /// Writes a small custom word embedding model to disk, builds a
        /// normalize / tokenize / embed pipeline around it, and prints the embedding
        /// vector produced for a single sentence.
        /// </summary>
        public static void Example()
        {
            // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging,
            // as well as the source of randomness.
            var mlContext = new MLContext();

            // Create an empty data sample list. The 'ApplyWordEmbedding' does not require training data as
            // the estimator ('WordEmbeddingEstimator') created by 'ApplyWordEmbedding' API is not a trainable estimator.
            // The empty list is only needed to pass input schema to the pipeline.
            var emptySamples = new List<TextData>();

            // Convert sample list to an empty IDataView.
            var emptyDataView = mlContext.Data.LoadFromEnumerable(emptySamples);

            // Write a custom 3-dimensional word embedding model with 4 words.
            // Each line follows '<word> <float> <float> <float>' pattern.
            // Lines that do not conform to the pattern are ignored.
            //
            // Path.Combine is used instead of a hard-coded '\' so that the file is created
            // as 'custommodel.txt' in the current directory on every platform.
            var pathToCustomModel = Path.Combine(".", "custommodel.txt");
            File.WriteAllLines(pathToCustomModel, new[]
            {
                "great 1.0 2.0 3.0",
                "product -1.0 -2.0 -3.0",
                "like -1 100.0 -100",
                "buy 0 0 20"
            });

            // A pipeline for converting text into a 9-dimension word embedding vector using the custom word embedding model.
            // The 'ApplyWordEmbedding' computes, per embedding dimension, the minimum, average and maximum
            // over the embedding vectors of the tokens found in the model.
            // Tokens in 'custommodel.txt' model are represented as 3-dimension vector.
            // Therefore, the output is of 9-dimension [min, avg, max].
            //
            // The 'ApplyWordEmbedding' API requires vector of text as input.
            // The pipeline first normalizes and tokenizes text then applies word embedding transformation.
            var textPipeline = mlContext.Transforms.Text.NormalizeText("Text")
                .Append(mlContext.Transforms.Text.TokenizeIntoWords("Tokens", "Text"))
                .Append(mlContext.Transforms.Text.ApplyWordEmbedding("Features", pathToCustomModel, "Tokens"));

            // Fit to data.
            var textTransformer = textPipeline.Fit(emptyDataView);

            // Create the prediction engine to get the embedding vector from the input text/string.
            var predictionEngine = mlContext.Model.CreatePredictionEngine<TextData, TransformedTextData>(textTransformer);

            // Call the prediction API to convert the text into embedding vector.
            var data = new TextData() { Text = "This is a great product. I would like to buy it again." };
            var prediction = predictionEngine.Predict(data);

            // Print the length of the embedding vector.
            Console.WriteLine($"Number of Features: {prediction.Features.Length}");

            // Print the embedding vector. Values are formatted with the invariant culture so that
            // the output matches the expected text below regardless of the machine's regional settings.
            Console.Write("Features: ");
            foreach (var f in prediction.Features)
            {
                Console.Write(f.ToString("F4", System.Globalization.CultureInfo.InvariantCulture) + " ");
            }

            // Terminate the feature line so later console output starts on a fresh line.
            Console.WriteLine();

            // Expected output:
            //  Number of Features: 9
            //  Features: -1.0000 0.0000 -100.0000 0.0000 34.0000 -25.6667 1.0000 100.0000 20.0000
            //
            // NOTE: these numbers correspond to the embeddings of 'great', 'like' and 'buy' only.
            // The 'product' entry does not contribute, presumably because the token produced from
            // the sentence keeps its trailing period ('product.') and so does not match the model entry.
        }

        /// <summary>
        /// Input row for the pipeline: the raw text to embed.
        /// </summary>
        public class TextData
        {
            public string Text { get; set; }
        }

        /// <summary>
        /// Output row of the pipeline: the input text plus the 'Features' column
        /// written by the word embedding transform.
        /// </summary>
        public class TransformedTextData : TextData
        {
            public float[] Features { get; set; }
        }
    }
}
0 commit comments