Commit 558b026

Kmeans no sample utils
1 parent 37ed336 commit 558b026

File tree

6 files changed: +390 additions, -71 deletions

Lines changed: 113 additions & 0 deletions
@@ -0,0 +1,113 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using Microsoft.ML;
+using Microsoft.ML.Data;
+<# if (TrainerOptions != null) { #>
+<#=OptionsInclude#>
+<# } #>
+
+namespace Samples.Dynamic.Trainers.Clustering
+{
+    public static class <#=ClassName#>
+    {<#=Comments#>
+        public static void Example()
+        {
+            // Create a new context for ML.NET operations. It can be used for exception tracking and logging,
+            // as a catalog of available operations and as the source of randomness.
+            // Setting the seed to a fixed number in this example to make outputs deterministic.
+            var mlContext = new MLContext(seed: 0);
+
+            // Create a list of training data points.
+            var dataPoints = GenerateRandomDataPoints(1000, <#=DataSeed#>);
+
+            // Convert the list of data points to an IDataView object, which is consumable by ML.NET API.
+            var trainingData = mlContext.Data.LoadFromEnumerable(dataPoints);
+
+<# if (TrainerOptions == null) { #>
+            // Define the trainer.
+            var pipeline = mlContext.Clustering.Trainers.<#=Trainer#>(<#=InlineTrainerOptions#>);
+<# } else { #>
+            // Define trainer options.
+            var options = new <#=TrainerOptions#>;
+
+            // Define the trainer.
+            var pipeline = mlContext.Clustering.Trainers.<#=Trainer#>(options);
+<# } #>
+
+            // Train the model.
+            var model = pipeline.Fit(trainingData);
+
+            // Create testing data. Use different random seed to make it different from training data.
+            var testData = mlContext.Data.LoadFromEnumerable(GenerateRandomDataPoints(500, seed: 123));
+
+            // Run the model on test data set.
+            var transformedTestData = model.Transform(testData);
+
+            // Convert IDataView object to a list.
+            var predictions = mlContext.Data.CreateEnumerable<Prediction>(transformedTestData, reuseRowObject: false).ToList();
+
+            // Look at 5 predictions
+            foreach (var p in predictions.Take(2))
+                Console.WriteLine($"Label: {p.Label}, Prediction: {p.PredictedLabel}");
+            foreach (var p in predictions.TakeLast(3))
+                Console.WriteLine($"Label: {p.Label}, Prediction: {p.PredictedLabel}");
+
+            <#=ExpectedOutputPerInstance#>
+
+            // Evaluate the overall metrics
+            var metrics = mlContext.Clustering.Evaluate(transformedTestData, "Label", "Score", "Features");
+            Console.WriteLine($"Normalized Mutual Information: {metrics.NormalizedMutualInformation:F2}");
+            Console.WriteLine($"Average Distance: {metrics.AverageDistance:F2}");
+            Console.WriteLine($"Davies Bouldin Index: {metrics.DaviesBouldinIndex:F2}");
+
+            <#=ExpectedOutput#>
+
+            // Get cluster centroids and the number of clusters k from KMeansModelParameters.
+            VBuffer<float>[] centroids = default;
+
+            var modelParams = model.Model;
+            modelParams.GetClusterCentroids(ref centroids, out int k);
+            Console.WriteLine($"The first 3 coordinates of the first centroid are: ({string.Join(", ", centroids[0].GetValues().ToArray().Take(3))})");
+            Console.WriteLine($"The first 3 coordinates of the second centroid are: ({string.Join(", ", centroids[1].GetValues().ToArray().Take(3))})");
+
+            <#=ExpectedCentroidsOutput#>
+        }
+
+        private static IEnumerable<DataPoint> GenerateRandomDataPoints(int count, int seed = 0)
+        {
+            var random = new Random(seed);
+            float randomFloat() => (float)random.NextDouble();
+            for (int i = 0; i < count; i++)
+            {
+                int label = i < count / 2 ? 0 : 1;
+                yield return new DataPoint
+                {
+                    Label = (uint)label,
+                    // Create random features with two clusters.
+                    // The first half has feature values centered around 0.6, the second half has values centered around 0.4.
+                    Features = Enumerable.Repeat(label, 50).Select(index => label == 0 ? randomFloat() + 0.1f : randomFloat() - 0.1f).ToArray()
+                };
+            }
+        }
+
+        // Example with label and 50 feature values. A data set is a collection of such examples.
+        private class DataPoint
+        {
+            [KeyType(2)]
+            public uint Label { get; set; }
+
+            [VectorType(50)]
+            public float[] Features { get; set; }
+        }
+
+        // Class used to capture predictions.
+        private class Prediction
+        {
+            // Original label.
+            public uint Label { get; set; }
+            // Predicted label from the trainer.
+            public uint PredictedLabel { get; set; }
+        }
+    }
+}
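
The template above has two rendering branches: when TrainerOptions is null it emits the one-line trainer call using InlineTrainerOptions, otherwise it builds an options object first. As a rough illustration only (the options-based .tt file is not among the files shown in this excerpt, and the sample name is hypothetical), a KMeansWithOptions variant that sets TrainerOptions would expand to something like the following sketch, assuming Microsoft.ML.Trainers.KMeansTrainer.Options:

// Hypothetical expansion of the "TrainerOptions != null" branch; the sample
// name and option values are assumptions, not part of this commit.
using Microsoft.ML;
using Microsoft.ML.Trainers;

var mlContext = new MLContext(seed: 0);

// Define trainer options.
var options = new KMeansTrainer.Options
{
    NumberOfClusters = 2,
    // Single-threaded training keeps the centroids deterministic, as the
    // note in the expected-centroids comment suggests.
    NumberOfThreads = 1
};

// Define the trainer.
var pipeline = mlContext.Clustering.Trainers.KMeans(options);
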
Lines changed: 92 additions & 29 deletions
@@ -1,51 +1,114 @@
 using System;
+using System.Collections.Generic;
+using System.Linq;
 using Microsoft.ML;
 using Microsoft.ML.Data;
 
-namespace Samples.Dynamic
+namespace Samples.Dynamic.Trainers.Clustering
 {
-    public class KMeans
+    public static class KMeans
     {
         public static void Example()
         {
-            // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging,
-            // as well as the source of randomness.
-            var ml = new MLContext(seed: 1);
+            // Create a new context for ML.NET operations. It can be used for exception tracking and logging,
+            // as a catalog of available operations and as the source of randomness.
+            // Setting the seed to a fixed number in this example to make outputs deterministic.
+            var mlContext = new MLContext(seed: 0);
 
-            // Get a small dataset as an IEnumerable and convert it to an IDataView.
-            var data = Microsoft.ML.SamplesUtils.DatasetUtils.GetInfertData();
-            var trainData = ml.Data.LoadFromEnumerable(data);
+            // Create a list of training data points.
+            var dataPoints = GenerateRandomDataPoints(1000, 123);
 
-            // Preview of the data.
-            //
-            // Age  Case  Education  Induced  Parity  PooledStratum  RowNum  ...
-            // 26   1     0-5yrs     1        6       3              1       ...
-            // 42   1     0-5yrs     1        1       1              2       ...
-            // 39   1     0-5yrs     2        6       4              3       ...
-            // 34   1     0-5yrs     2        4       2              4       ...
-            // 35   1     6-11yrs    1        3       32             5       ...
+            // Convert the list of data points to an IDataView object, which is consumable by ML.NET API.
+            var trainingData = mlContext.Data.LoadFromEnumerable(dataPoints);
+
+            // Define the trainer.
+            var pipeline = mlContext.Clustering.Trainers.KMeans(numberOfClusters: 2);
+
+            // Train the model.
+            var model = pipeline.Fit(trainingData);
+
+            // Create testing data. Use different random seed to make it different from training data.
+            var testData = mlContext.Data.LoadFromEnumerable(GenerateRandomDataPoints(500, seed: 123));
+
+            // Run the model on test data set.
+            var transformedTestData = model.Transform(testData);
 
-            // A pipeline for concatenating the age, parity and induced columns together in the Features column and training a KMeans model on them.
-            string outputColumnName = "Features";
-            var pipeline = ml.Transforms.Concatenate(outputColumnName, new[] { "Age", "Parity", "Induced" })
-                .Append(ml.Clustering.Trainers.KMeans(outputColumnName, numberOfClusters: 2));
+            // Convert IDataView object to a list.
+            var predictions = mlContext.Data.CreateEnumerable<Prediction>(transformedTestData, reuseRowObject: false).ToList();
 
-            var model = pipeline.Fit(trainData);
+            // Look at 5 predictions
+            foreach (var p in predictions.Take(2))
+                Console.WriteLine($"Label: {p.Label}, Prediction: {p.PredictedLabel}");
+            foreach (var p in predictions.TakeLast(3))
+                Console.WriteLine($"Label: {p.Label}, Prediction: {p.PredictedLabel}");
+
+            // Expected output:
+            // Label: 1, Prediction: 1
+            // Label: 1, Prediction: 1
+            // Label: 2, Prediction: 2
+            // Label: 2, Prediction: 2
+            // Label: 2, Prediction: 2
+
+            // Evaluate the overall metrics
+            var metrics = mlContext.Clustering.Evaluate(transformedTestData, "Label", "Score", "Features");
+            Console.WriteLine($"Normalized Mutual Information: {metrics.NormalizedMutualInformation:F2}");
+            Console.WriteLine($"Average Distance: {metrics.AverageDistance:F2}");
+            Console.WriteLine($"Davies Bouldin Index: {metrics.DaviesBouldinIndex:F2}");
+
+            // Expected output:
+            // Normalized Mutual Information: 0.95
+            // Average Distance: 4.17
+            // Davies Bouldin Index: 2.87
 
             // Get cluster centroids and the number of clusters k from KMeansModelParameters.
             VBuffer<float>[] centroids = default;
-            int k;
 
-            var modelParams = model.LastTransformer.Model;
-            modelParams.GetClusterCentroids(ref centroids, out k);
+            var modelParams = model.Model;
+            modelParams.GetClusterCentroids(ref centroids, out int k);
+            Console.WriteLine($"The first 3 coordinates of the first centroid are: ({string.Join(", ", centroids[0].GetValues().ToArray().Take(3))})");
+            Console.WriteLine($"The first 3 coordinates of the second centroid are: ({string.Join(", ", centroids[1].GetValues().ToArray().Take(3))})");
 
-            var centroid = centroids[0].GetValues();
-            Console.WriteLine($"The coordinates of centroid 0 are: ({string.Join(", ", centroid.ToArray())})");
-
-            // Expected output similar to:
-            // The coordinates of centroid 0 are: (26, 6, 1)
+            // Expected output similar to:
+            // The first 3 coordinates of the first centroid are: (0.6035213, 0.6017533, 0.5964218)
+            // The first 3 coordinates of the second centroid are: (0.4031044, 0.4175443, 0.4082336)
             //
             // Note: use the advanced options constructor to set the number of threads to 1 for a deterministic behavior.
         }
+
+        private static IEnumerable<DataPoint> GenerateRandomDataPoints(int count, int seed = 0)
+        {
+            var random = new Random(seed);
+            float randomFloat() => (float)random.NextDouble();
+            for (int i = 0; i < count; i++)
+            {
+                int label = i < count / 2 ? 0 : 1;
+                yield return new DataPoint
+                {
+                    Label = (uint)label,
+                    // Create random features with two clusters.
+                    // The first half has feature values centered around 0.6, the second half has values centered around 0.4.
+                    Features = Enumerable.Repeat(label, 50).Select(index => label == 0 ? randomFloat() + 0.1f : randomFloat() - 0.1f).ToArray()
+                };
+            }
+        }
+
+        // Example with label and 50 feature values. A data set is a collection of such examples.
+        private class DataPoint
+        {
+            [KeyType(2)]
+            public uint Label { get; set; }
+
+            [VectorType(50)]
+            public float[] Features { get; set; }
+        }
+
+        // Class used to capture predictions.
+        private class Prediction
+        {
+            // Original label.
+            public uint Label { get; set; }
+            // Predicted label from the trainer.
+            public uint PredictedLabel { get; set; }
+        }
     }
 }
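
One detail worth calling out in the diff above: the old sample built a two-stage pipeline (Concatenate followed by KMeans), so the fitted result was a transformer chain and the centroids had to be fetched through model.LastTransformer.Model. The new pipeline is the trainer alone, so Fit returns the prediction transformer itself and model.Model reaches the k-means parameters directly. A minimal sketch of the same access with explicit types (type names assumed from the ML.NET clustering API, not spelled out in the sample):

// The single-estimator pipeline yields the prediction transformer directly,
// so no LastTransformer hop is needed to reach the k-means parameters.
ClusteringPredictionTransformer<KMeansModelParameters> model = pipeline.Fit(trainingData);
KMeansModelParameters modelParams = model.Model;

// GetClusterCentroids fills one VBuffer per cluster and reports k.
VBuffer<float>[] centroids = default;
modelParams.GetClusterCentroids(ref centroids, out int k);    // here k == 2
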
Lines changed: 29 additions & 0 deletions
@@ -0,0 +1,29 @@
+<#@ include file="Clustering.ttinclude"#>
+<#+
+string ClassName = "KMeans";
+string Trainer = "KMeans";
+string TrainerOptions = null;
+string InlineTrainerOptions = "numberOfClusters: 2";
+int DataSeed = 123;
+
+string OptionsInclude = "";
+string Comments = "";
+
+string ExpectedOutputPerInstance = @"// Expected output:
+// Label: 1, Prediction: 1
+// Label: 1, Prediction: 1
+// Label: 2, Prediction: 2
+// Label: 2, Prediction: 2
+// Label: 2, Prediction: 2";
+
+string ExpectedOutput = @"// Expected output:
+// Normalized Mutual Information: 0.95
+// Average Distance: 4.17
+// Davies Bouldin Index: 2.87";
+
+string ExpectedCentroidsOutput = @"// Expected output similar to:
+// The first 3 coordinates of the first centroid are: (0.6035213, 0.6017533, 0.5964218)
+// The first 3 coordinates of the second centroid are: (0.4031044, 0.4175443, 0.4082336)
+//
+// Note: use the advanced options constructor to set the number of threads to 1 for a deterministic behavior.";
+#>
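
A per-trainer .tt file like this one only supplies the substitution values; the included Clustering.ttinclude (presumably the first new file in this commit) provides the shared sample body. Because TrainerOptions is null here, the template takes its inline branch, so the <#=Trainer#> and <#=InlineTrainerOptions#> placeholders render the exact line that appears in the generated KMeans.cs shown earlier:

// Rendered from: var pipeline = mlContext.Clustering.Trainers.<#=Trainer#>(<#=InlineTrainerOptions#>);
var pipeline = mlContext.Clustering.Trainers.KMeans(numberOfClusters: 2);
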
