using System;
using System.Collections.Generic;
using System.Linq;
using Microsoft.ML;
using Microsoft.ML.Data;

namespace Samples.Dynamic.Trainers.Clustering
{
    public static class KMeans
    {
        public static void Example()
        {
            // Create a new context for ML.NET operations. It can be used for exception tracking and logging,
            // as a catalog of available operations and as the source of randomness.
            // Setting the seed to a fixed number in this example to make outputs deterministic.
            var mlContext = new MLContext(seed: 0);

            // Generate 1000 random data points forming two clusters (see GenerateRandomDataPoints below).
            var dataPoints = GenerateRandomDataPoints(1000, 123);

            // Convert the list of data points to an IDataView object, which is consumable by ML.NET API.
            var trainingData = mlContext.Data.LoadFromEnumerable(dataPoints);

            // Define the trainer.
            var pipeline = mlContext.Clustering.Trainers.KMeans(numberOfClusters: 2);

            // Train the model.
            var model = pipeline.Fit(trainingData);

            // Create testing data.
            // NOTE(review): the seed here (123) is the same one used for the training data above, so the
            // test points are drawn from the same random stream as the first 500 training points — the
            // expected outputs below were produced with this seed. Pass a different seed if truly
            // independent test data is desired (the printed metrics will then change).
            var testData = mlContext.Data.LoadFromEnumerable(GenerateRandomDataPoints(500, seed: 123));

            // Run the model on test data set.
            var transformedTestData = model.Transform(testData);

            // Convert IDataView object to a list.
            var predictions = mlContext.Data.CreateEnumerable<Prediction>(transformedTestData, reuseRowObject: false).ToList();

            // Look at 5 predictions: the first 2 (cluster of label 1) and the last 3 (cluster of label 2).
            foreach (var p in predictions.Take(2))
                Console.WriteLine($"Label: {p.Label}, Prediction: {p.PredictedLabel}");
            foreach (var p in predictions.TakeLast(3))
                Console.WriteLine($"Label: {p.Label}, Prediction: {p.PredictedLabel}");

            // Expected output:
            //   Label: 1, Prediction: 1
            //   Label: 1, Prediction: 1
            //   Label: 2, Prediction: 2
            //   Label: 2, Prediction: 2
            //   Label: 2, Prediction: 2

            // Evaluate the overall metrics.
            var metrics = mlContext.Clustering.Evaluate(transformedTestData, "Label", "Score", "Features");
            Console.WriteLine($"Normalized Mutual Information: {metrics.NormalizedMutualInformation:F2}");
            Console.WriteLine($"Average Distance: {metrics.AverageDistance:F2}");
            Console.WriteLine($"Davies Bouldin Index: {metrics.DaviesBouldinIndex:F2}");

            // Expected output:
            //   Normalized Mutual Information: 0.95
            //   Average Distance: 4.17
            //   Davies Bouldin Index: 2.87

            // Get cluster centroids and the number of clusters k from KMeansModelParameters.
            VBuffer<float>[] centroids = default;

            var modelParams = model.Model;
            modelParams.GetClusterCentroids(ref centroids, out int k);
            Console.WriteLine($"The first 3 coordinates of the first centroid are: ({string.Join(", ", centroids[0].GetValues().ToArray().Take(3))})");
            Console.WriteLine($"The first 3 coordinates of the second centroid are: ({string.Join(", ", centroids[1].GetValues().ToArray().Take(3))})");

            // Expected output similar to:
            //   The first 3 coordinates of the first centroid are: (0.6035213, 0.6017533, 0.5964218)
            //   The first 3 coordinates of the second centroid are: (0.4031044, 0.4175443, 0.4082336)
            //
            // Note: use the advanced options constructor to set the number of threads to 1 for a deterministic behavior.
        }

        // Generates `count` examples split evenly into two clusters: the first half is labeled 0,
        // the second half is labeled 1. Deterministic for a given seed.
        private static IEnumerable<DataPoint> GenerateRandomDataPoints(int count, int seed = 0)
        {
            var random = new Random(seed);
            float randomFloat() => (float)random.NextDouble();
            for (int i = 0; i < count; i++)
            {
                int label = i < count / 2 ? 0 : 1;
                yield return new DataPoint
                {
                    Label = (uint)label,
                    // Create random features with two clusters.
                    // The first half has feature values centered around 0.6, the second half has values centered around 0.4.
                    Features = Enumerable.Repeat(label, 50).Select(index => label == 0 ? randomFloat() + 0.1f : randomFloat() - 0.1f).ToArray()
                };
            }
        }

        // Example with label and 50 feature values. A data set is a collection of such examples.
        private class DataPoint
        {
            [KeyType(2)]
            public uint Label { get; set; }

            [VectorType(50)]
            public float[] Features { get; set; }
        }

        // Class used to capture predictions.
        private class Prediction
        {
            // Original label.
            public uint Label { get; set; }
            // Predicted label from the trainer.
            public uint PredictedLabel { get; set; }
        }
    }
}