Skip to content

Commit 59d1a08

Browse files
committed
Lockdown of Microsoft.ML.LightGBM public surface.
1 parent 834e471 commit 59d1a08

File tree

8 files changed

+291
-14
lines changed

8 files changed

+291
-14
lines changed
Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
using System;
2+
using Microsoft.ML.Data;
3+
using Microsoft.ML.Transforms.Categorical;
4+
5+
namespace Microsoft.ML.Samples.Dynamic.LightGBM
6+
{
7+
/// <summary>
/// Sample that trains and evaluates a LightGBM binary classifier on the Adult dataset.
/// </summary>
public class LightGbmBinaryClassification
{
    /// <summary>
    /// Loads the Adult dataset, one-hot encodes the categorical columns, trains a LightGBM
    /// binary classifier on 90% of the rows and prints evaluation metrics for the remaining 10%.
    /// </summary>
    public static void LightGbmBinaryClassificationExample()
    {
        // Download a classification dataset from github.com/dotnet/machinelearning.
        // According to the sample utilities it is stored next to the executable.
        string dataFilePath = SamplesUtils.DatasetUtils.DownloadAdultDataset();

        // Data Preview
        //  1. Column: age (numeric)
        //  2. Column: workclass (text/categorical)
        //  3. Column: fnlwgt (numeric)
        //  4. Column: education (text/categorical)
        //  5. Column: education-num (numeric)
        //  6. Column: marital-status (text/categorical)
        //  7. Column: occupation (text/categorical)
        //  8. Column: relationship (text/categorical)
        //  9. Column: ethnicity (text/categorical)
        // 10. Column: sex (text/categorical)
        // 11. Column: capital-gain (numeric)
        // 12. Column: capital-loss (numeric)
        // 13. Column: hours-per-week (numeric)
        // 14. Column: native-country (text/categorical)
        // 15. Column [Label]: IsOver50K (boolean)

        // Create the MLContext, the entry point for all ML.NET operations used below.
        var mlContext = new MLContext();

        // Describe the layout of the comma-separated file: one loader column per source column.
        var loaderArguments = new TextLoader.Arguments
        {
            Separators = new[] { ',' },
            HasHeader = true,
            Columns = new[]
            {
                new TextLoader.Column("age", DataKind.R4, 0),
                new TextLoader.Column("workclass", DataKind.Text, 1),
                new TextLoader.Column("fnlwgt", DataKind.R4, 2),
                new TextLoader.Column("education", DataKind.Text, 3),
                new TextLoader.Column("education-num", DataKind.R4, 4),
                new TextLoader.Column("marital-status", DataKind.Text, 5),
                new TextLoader.Column("occupation", DataKind.Text, 6),
                new TextLoader.Column("relationship", DataKind.Text, 7),
                new TextLoader.Column("ethnicity", DataKind.Text, 8),
                new TextLoader.Column("sex", DataKind.Text, 9),
                new TextLoader.Column("capital-gain", DataKind.R4, 10),
                new TextLoader.Column("capital-loss", DataKind.R4, 11),
                new TextLoader.Column("hours-per-week", DataKind.R4, 12),
                new TextLoader.Column("native-country", DataKind.Text, 13),
                new TextLoader.Column("Label", DataKind.Bool, 14)
            }
        };
        var dataView = mlContext.Data.ReadFromTextFile(dataFilePath, loaderArguments);

        // Hold back 10% of the rows so the model can be evaluated on data it has not seen.
        var (trainingSet, holdoutSet) = mlContext.BinaryClassification.TrainTestSplit(dataView, testFraction: 0.1);

        // Text columns that are replaced by their one-hot encoded representation.
        var categoricalColumns = new OneHotEncodingEstimator.ColumnInfo[]
        {
            new OneHotEncodingEstimator.ColumnInfo("marital-status"),
            new OneHotEncodingEstimator.ColumnInfo("occupation"),
            new OneHotEncodingEstimator.ColumnInfo("relationship"),
            new OneHotEncodingEstimator.ColumnInfo("ethnicity"),
            new OneHotEncodingEstimator.ColumnInfo("sex"),
            new OneHotEncodingEstimator.ColumnInfo("native-country"),
        };

        // Build the estimator chain: encode -> count-based feature selection -> concatenate -> normalize -> train.
        var pipeline = mlContext.Transforms.Categorical.OneHotEncoding(categoricalColumns)
            .Append(mlContext.Transforms.FeatureSelection.SelectFeaturesBasedOnCount("native-country", count: 10))
            .Append(mlContext.Transforms.Concatenate("Features",
                "age",
                "education-num",
                "marital-status",
                "relationship",
                "ethnicity",
                "sex",
                "hours-per-week",
                "native-country"))
            .Append(mlContext.Transforms.Normalize("Features"))
            .Append(mlContext.BinaryClassification.Trainers.LightGbm());

        // Train on the training split only.
        var model = pipeline.Fit(trainingSet);

        // Score the held-out rows and compute binary classification metrics on them.
        var scoredHoldout = model.Transform(holdoutSet);
        var metrics = mlContext.BinaryClassification.Evaluate(scoredHoldout);

        Console.WriteLine($"Accuracy: {metrics.Accuracy}"); // 0.84
        Console.WriteLine($"AUC: {metrics.Auc}"); // 0.88
        Console.WriteLine($"F1 Score: {metrics.F1Score}"); // 0.62

        Console.WriteLine($"Negative Precision: {metrics.NegativePrecision}"); // 0.88
        Console.WriteLine($"Negative Recall: {metrics.NegativeRecall}"); // 0.91
        Console.WriteLine($"Positive Precision: {metrics.PositivePrecision}"); // 0.67
        Console.WriteLine($"Positive Recall: {metrics.PositiveRecall}"); // 0.58
    }
}
103+
}
Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
using System;
2+
using System.Linq;
3+
using Microsoft.ML.Data;
4+
using Microsoft.ML.SamplesUtils;
5+
6+
namespace Microsoft.ML.Samples.Dynamic.LightGBM
7+
{
8+
/// <summary>
/// Sample that trains a LightGBM multiclass classifier on randomly generated in-memory examples.
/// </summary>
class LightGbmMulticlassClassification
{
    /// <summary>
    /// Generates 1000 random examples, trains a LightGBM multiclass classifier on half of them,
    /// evaluates it on the other half and prints the metrics together with the predicted label
    /// and per-class scores of one test example.
    /// </summary>
    public static void LightGbmMulticlassClassificationExample()
    {
        // Create a general context for ML.NET operations. It can be used for exception tracking and logging,
        // as a catalog of available operations and as the source of randomness.
        var mlContext = new MLContext();

        // Create in-memory examples as C# native class.
        var examples = DatasetUtils.GenerateRandomMulticlassClassificationExamples(1000);

        // Convert native C# class to IDataView, a format consumable by ML.NET functions.
        var dataView = mlContext.Data.ReadFromEnumerable(examples);

        // Create a pipeline.
        //  - Convert the string labels into key types.
        //  - Apply the LightGbm multiclass trainer.
        //  - Convert the predicted label into a key type and expose the scores under the name "Scores",
        //    so the output can be mapped back onto the native example class.
        var pipeline = mlContext.Transforms.Conversion.MapValueToKey("LabelIndex", "Label")
            .Append(mlContext.MulticlassClassification.Trainers.LightGbm(labelColumn: "LabelIndex"))
            .Append(mlContext.Transforms.Conversion.MapValueToKey("PredictedLabelIndex", "PredictedLabel"))
            .Append(mlContext.Transforms.CopyColumns("Scores", "Score"));

        // Split the data into training and test sets. Only the training set is used in fitting
        // the created pipeline. Metrics are computed on the test set.
        var (trainingData, testingData) = mlContext.MulticlassClassification.TrainTestSplit(dataView, testFraction: 0.5);

        // Train the model.
        var model = pipeline.Fit(trainingData);

        // Do prediction on the test set.
        var dataWithPredictions = model.Transform(testingData);

        // Evaluate the trained model on the test set.
        var metrics = mlContext.MulticlassClassification.Evaluate(dataWithPredictions, label: "LabelIndex");

        // Print the computed metrics so they can be checked for reasonableness.
        // The original sample documented values of roughly:
        //   Macro accuracy: 0.863482146891263, Micro accuracy: 0.86309523809523814.
        Console.WriteLine("Macro accuracy: {0}, Micro accuracy: {1}.", metrics.AccuracyMacro, metrics.AccuracyMicro);

        // Convert prediction in ML.NET format to native C# class.
        var nativePredictions = mlContext.CreateEnumerable<DatasetUtils.MulticlassClassificationExample>(dataWithPredictions, false).ToList();

        // Get the schema of the prediction output. It contains metadata such as the mapping from
        // predicted label index (e.g., 1) to its actual label (e.g., "AA").
        var schema = dataWithPredictions.Schema;

        // Retrieve the mapping from labels to label indexes.
        var labelBuffer = new VBuffer<ReadOnlyMemory<char>>();
        schema[nameof(DatasetUtils.MulticlassClassificationExample.PredictedLabelIndex)].Metadata.GetValue("KeyValues", ref labelBuffer);

        // nativeLabels is { "AA" , "BB", "CC", "DD" }.
        // nativeLabels[nativePrediction.PredictedLabelIndex - 1] is the original label indexed by
        // nativePrediction.PredictedLabelIndex.
        var nativeLabels = labelBuffer.DenseValues().ToArray();

        // Show prediction result for the 3rd example.
        var nativePrediction = nativePredictions[2];

        // Console output:
        //   Our predicted label to this example is "AA" with probability 0.922597349.
        Console.WriteLine("Our predicted label to this example is {0} with probability {1}",
            nativeLabels[(int)nativePrediction.PredictedLabelIndex - 1],
            nativePrediction.Scores[(int)nativePrediction.PredictedLabelIndex - 1]);

        // Scores and nativeLabels are two parallel attributes; that is, Scores[i] is the probability of being nativeLabels[i].
        // Console output:
        //   The probability of being class "AA" is 0.922597349.
        //   The probability of being class "BB" is 0.07508608.
        //   The probability of being class "CC" is 0.00221699756.
        //   The probability of being class "DD" is 9.95488E-05.
        for (int i = 0; i < nativeLabels.Length; ++i)
        {
            Console.WriteLine("The probability of being class {0} is {1}.", nativeLabels[i], nativePrediction.Scores[i]);
        }
    }
}
81+
}
Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
using System;
2+
using Microsoft.ML.Data;
3+
4+
namespace Microsoft.ML.Samples.Dynamic.LightGBM
5+
{
6+
/// <summary>
/// Sample that trains and evaluates a LightGBM regressor on the housing dataset.
/// </summary>
class LightGbmRegression
{
    /// <summary>
    /// Loads the housing dataset, trains a LightGBM regressor on 90% of the rows, prints two of
    /// the learned feature weights and then prints regression metrics for the remaining 10%.
    /// </summary>
    public static void LightGbmRegressionExample()
    {
        // Downloading a regression dataset from github.com/dotnet/machinelearning.
        // This will create a housing.txt file in the filesystem where this code runs;
        // you can open the file to see the data.
        string dataFile = SamplesUtils.DatasetUtils.DownloadHousingRegressionDataset();

        // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging,
        // as well as the source of randomness.
        var mlContext = new MLContext();

        // Creating a data reader, based on the format of the data.
        // The data is tab separated with all numeric columns.
        // The first column is the label and the rest are numeric features.
        // Only source columns 1 through 6 are loaded into the "Features" vector
        // (six features, assuming the index range is inclusive).
        var dataView = mlContext.Data.ReadFromTextFile(dataFile, new TextLoader.Arguments
        {
            Separators = new[] { '\t' },
            HasHeader = true,
            Columns = new[]
            {
                new TextLoader.Column("Label", DataKind.R4, 0),
                new TextLoader.Column("Features", DataKind.R4, 1, 6)
            }
        });

        //////////////////// Data Preview ////////////////////
        // MedianHomeValue    CrimesPerCapita    PercentResidental    PercentNonRetail    CharlesRiver    NitricOxides    RoomsPerDwelling    PercentPre40s
        // 24.00              0.00632            18.00                2.310               0               0.5380          6.5750              65.20
        // 21.60              0.02731            00.00                7.070               0               0.4690          6.4210              78.90
        // 34.70              0.02729            00.00                7.070               0               0.4690          7.1850              61.10

        // Leave 10% of the rows out for testing.
        var (trainData, testData) = mlContext.Regression.TrainTestSplit(dataView, testFraction: 0.1);

        // Create the estimator. Here we only need the LightGbm trainer,
        // as the data is already in a form consumable by the trainer.
        var pipeline = mlContext.Regression.Trainers.LightGbm(
            numLeaves: 4,
            minDataPerLeaf: 6,
            learningRate: 0.001);

        // Fit this pipeline to the training data.
        var model = pipeline.Fit(trainData);

        // Check the weights that the model learned.
        VBuffer<float> weights = default;
        model.Model.GetFeatureWeights(ref weights);

        // The printed label matches the index into the weights, which is also the
        // zero-based position of the feature inside the "Features" vector.
        var weightsValues = weights.GetValues();
        Console.WriteLine($"weight 0 - {weightsValues[0]}"); // CrimesPerCapita  (weight 0) = 0.1898361
        Console.WriteLine($"weight 5 - {weightsValues[5]}"); // RoomsPerDwelling (weight 5) = 1

        // Evaluate how the model is doing on the test data.
        var dataWithPredictions = model.Transform(testData);
        var metrics = mlContext.Regression.Evaluate(dataWithPredictions);

        Console.WriteLine($"L1 - {metrics.L1}"); // 4.9669731
        Console.WriteLine($"L2 - {metrics.L2}"); // 51.37296
        Console.WriteLine($"LossFunction - {metrics.LossFn}"); // 51.37296
        Console.WriteLine($"RMS - {metrics.Rms}"); // 7.167493
        Console.WriteLine($"RSquared - {metrics.RSquared}"); // 0.079478
    }
}
71+
}

src/Microsoft.ML.LightGBM/LightGbmArguments.cs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,12 +24,13 @@
2424

2525
namespace Microsoft.ML.LightGBM
2626
{
27-
public delegate void SignatureLightGBMBooster();
27+
internal delegate void SignatureLightGBMBooster();
2828

2929
[TlcModule.ComponentKind("BoosterParameterFunction")]
3030
public interface ISupportBoosterParameterFactory : IComponentFactory<IBoosterParameter>
3131
{
3232
}
33+
3334
public interface IBoosterParameter
3435
{
3536
void UpdateParameters(Dictionary<string, object> res);

src/Microsoft.ML.LightGBM/LightGbmCatalog.cs

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,13 @@ public static class LightGbmExtensions
2424
/// <param name="numBoostRound">Number of iterations.</param>
2525
/// <param name="minDataPerLeaf">The minimal number of documents allowed in a leaf of the tree, out of the subsampled data.</param>
2626
/// <param name="learningRate">The learning rate.</param>
27+
/// <example>
28+
/// <format type="text/markdown">
29+
/// <![CDATA[
30+
/// [!code-csharp[LightGbmRegression](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/LightGBM/LightGBMRegression.cs)]
31+
/// ]]>
32+
/// </format>
33+
/// </example>
2734
public static LightGbmRegressorTrainer LightGbm(this RegressionCatalog.RegressionTrainers catalog,
2835
string labelColumn = DefaultColumnNames.Label,
2936
string featureColumn = DefaultColumnNames.Features,
@@ -62,6 +69,13 @@ public static LightGbmRegressorTrainer LightGbm(this RegressionCatalog.Regressio
6269
/// <param name="numBoostRound">Number of iterations.</param>
6370
/// <param name="minDataPerLeaf">The minimal number of documents allowed in a leaf of the tree, out of the subsampled data.</param>
6471
/// <param name="learningRate">The learning rate.</param>
72+
/// <example>
73+
/// <format type="text/markdown">
74+
/// <![CDATA[
75+
/// [!code-csharp[LightGbmBinaryClassification](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/LightGBM/LightGbmBinaryClassification.cs)]
76+
/// ]]>
77+
/// </format>
78+
/// </example>
6579
public static LightGbmBinaryTrainer LightGbm(this BinaryClassificationCatalog.BinaryClassificationTrainers catalog,
6680
string labelColumn = DefaultColumnNames.Label,
6781
string featureColumn = DefaultColumnNames.Features,
@@ -140,6 +154,13 @@ public static LightGbmRankingTrainer LightGbm(this RankingCatalog.RankingTrainer
140154
/// <param name="numBoostRound">Number of iterations.</param>
141155
/// <param name="minDataPerLeaf">The minimal number of documents allowed in a leaf of the tree, out of the subsampled data.</param>
142156
/// <param name="learningRate">The learning rate.</param>
157+
/// <example>
158+
/// <format type="text/markdown">
159+
/// <![CDATA[
160+
/// [!code-csharp[LightGbmMulticlassClassification](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/LightGBM/LightGBMMulticlassClassification.cs)]
161+
/// ]]>
162+
/// </format>
163+
/// </example>
143164
public static LightGbmMulticlassTrainer LightGbm(this MulticlassClassificationCatalog.MulticlassClassificationTrainers catalog,
144165
string labelColumn = DefaultColumnNames.Label,
145166
string featureColumn = DefaultColumnNames.Features,

src/Microsoft.ML.LightGBM/LightGbmRankingTrainer.cs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -76,9 +76,9 @@ private static LightGbmRankingModelParameters Create(IHostEnvironment env, Model
7676
/// <include file='doc.xml' path='doc/members/member[@name="LightGBM"]/*' />
7777
public sealed class LightGbmRankingTrainer : LightGbmTrainerBase<float, RankingPredictionTransformer<LightGbmRankingModelParameters>, LightGbmRankingModelParameters>
7878
{
79-
public const string UserName = "LightGBM Ranking";
80-
public const string LoadNameValue = "LightGBMRanking";
81-
public const string ShortName = "LightGBMRank";
79+
internal const string UserName = "LightGBM Ranking";
80+
internal const string LoadNameValue = "LightGBMRanking";
81+
internal const string ShortName = "LightGBMRank";
8282

8383
public override PredictionKind PredictionKind => PredictionKind.Ranking;
8484

src/Microsoft.ML.LightGBM/Parallel/IParallel.cs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,12 +12,12 @@ namespace Microsoft.ML.LightGBM
1212
/// <summary>
1313
/// Signature of LightGBM IAllreduce
1414
/// </summary>
15-
public delegate void SignatureParallelTrainer();
15+
internal delegate void SignatureParallelTrainer();
1616

1717
/// <summary>
1818
/// Reduce function defined on the LightGBM C++ side
1919
/// </summary>
20-
public unsafe delegate void ReduceFunction(byte* src, byte* output, int typeSize, int arraySize);
20+
internal unsafe delegate void ReduceFunction(byte* src, byte* output, int typeSize, int arraySize);
2121

2222
/// <summary>
2323
/// Definition of ReduceScatter function

src/Microsoft.ML.LightGBM/Parallel/SingleTrainer.cs

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -13,41 +13,41 @@
1313

1414
namespace Microsoft.ML.LightGBM
1515
{
16-
public sealed class SingleTrainer : IParallel
16+
internal sealed class SingleTrainer : IParallel
1717
{
18-
public AllgatherFunction GetAllgatherFunction()
18+
AllgatherFunction IParallel.GetAllgatherFunction()
1919
{
2020
return null;
2121
}
2222

23-
public ReduceScatterFunction GetReduceScatterFunction()
23+
ReduceScatterFunction IParallel.GetReduceScatterFunction()
2424
{
2525
return null;
2626
}
2727

28-
public int NumMachines()
28+
int IParallel.NumMachines()
2929
{
3030
return 1;
3131
}
3232

33-
public string ParallelType()
33+
string IParallel.ParallelType()
3434
{
3535
return "serial";
3636
}
3737

38-
public int Rank()
38+
int IParallel.Rank()
3939
{
4040
return 0;
4141
}
4242

43-
public Dictionary<string, string> AdditionalParams()
43+
Dictionary<string, string> IParallel.AdditionalParams()
4444
{
4545
return null;
4646
}
4747
}
4848

4949
[TlcModule.Component(Name = "Single", Desc = "Single node machine learning process.")]
50-
public sealed class SingleTrainerFactory : ISupportParallel
50+
internal sealed class SingleTrainerFactory : ISupportParallel
5151
{
5252
public IParallel CreateComponent(IHostEnvironment env) => new SingleTrainer();
5353
}

0 commit comments

Comments
 (0)