Skip to content

Commit ff6d16d

Browse files
authored
Adding sample for LightGbm ranking (#2729)
* Adding a sample for LightGbm Ranking
* PR feedback + cleaning up namespaces in Microsoft.ML.Samples project
* nit
* Changing dataset to small sample and other feedback
* Renaming LightGbm sample filenames
* Feedback
1 parent c90fa51 commit ff6d16d

19 files changed: +228 additions, −84 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
using Microsoft.ML.Transforms.Categorical;
22

3-
namespace Microsoft.ML.Samples.Dynamic
3+
namespace Microsoft.ML.Samples.Dynamic.Trainers.BinaryClassification
44
{
5-
public class LightGbmBinaryClassification
5+
public class LightGbm
66
{
77
// This example requires installation of additional nuget package <a href="https://www.nuget.org/packages/Microsoft.ML.LightGBM/">Microsoft.ML.LightGBM</a>.
88
public static void Example()
@@ -17,25 +17,25 @@ public static void Example()
1717
var split = mlContext.BinaryClassification.TrainTestSplit(dataview, testFraction: 0.1);
1818

1919
// Create the Estimator.
20-
var pipeline = mlContext.BinaryClassification.Trainers.LightGbm("IsOver50K", "Features");
20+
var pipeline = mlContext.BinaryClassification.Trainers.LightGbm();
2121

2222
// Fit this Pipeline to the Training Data.
2323
var model = pipeline.Fit(split.TrainSet);
2424

2525
// Evaluate how the model is doing on the test data.
2626
var dataWithPredictions = model.Transform(split.TestSet);
2727

28-
var metrics = mlContext.BinaryClassification.Evaluate(dataWithPredictions, "IsOver50K");
28+
var metrics = mlContext.BinaryClassification.Evaluate(dataWithPredictions);
2929
SamplesUtils.ConsoleUtils.PrintMetrics(metrics);
3030

31-
// Output:
32-
// Accuracy: 0.88
33-
// AUC: 0.93
34-
// F1 Score: 0.71
35-
// Negative Precision: 0.90
36-
// Negative Recall: 0.94
37-
// Positive Precision: 0.76
38-
// Positive Recall: 0.66
31+
// Expected output:
32+
// Accuracy: 0.88
33+
// AUC: 0.93
34+
// F1 Score: 0.71
35+
// Negative Precision: 0.90
36+
// Negative Recall: 0.94
37+
// Positive Precision: 0.76
38+
// Positive Recall: 0.66
3939
}
4040
}
4141
}
Lines changed: 11 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,9 @@
11
using Microsoft.ML.LightGBM;
2-
using Microsoft.ML.Transforms.Categorical;
32
using static Microsoft.ML.LightGBM.Options;
43

5-
namespace Microsoft.ML.Samples.Dynamic
4+
namespace Microsoft.ML.Samples.Dynamic.Trainers.BinaryClassification
65
{
7-
class LightGbmBinaryClassificationWithOptions
6+
class LightGbmWithOptions
87
{
98
// This example requires installation of additional nuget package <a href="https://www.nuget.org/packages/Microsoft.ML.LightGBM/">Microsoft.ML.LightGBM</a>.
109
public static void Example()
@@ -22,8 +21,6 @@ public static void Example()
2221
var pipeline = mlContext.BinaryClassification.Trainers.LightGbm(
2322
new Options
2423
{
25-
LabelColumn = "IsOver50K",
26-
FeatureColumn = "Features",
2724
Booster = new GossBooster.Options
2825
{
2926
TopRate = 0.3,
@@ -37,17 +34,17 @@ public static void Example()
3734
// Evaluate how the model is doing on the test data.
3835
var dataWithPredictions = model.Transform(split.TestSet);
3936

40-
var metrics = mlContext.BinaryClassification.Evaluate(dataWithPredictions, "IsOver50K");
37+
var metrics = mlContext.BinaryClassification.Evaluate(dataWithPredictions);
4138
SamplesUtils.ConsoleUtils.PrintMetrics(metrics);
4239

43-
// Output:
44-
// Accuracy: 0.88
45-
// AUC: 0.93
46-
// F1 Score: 0.71
47-
// Negative Precision: 0.90
48-
// Negative Recall: 0.94
49-
// Positive Precision: 0.76
50-
// Positive Recall: 0.67
40+
// Expected output:
41+
// Accuracy: 0.88
42+
// AUC: 0.93
43+
// F1 Score: 0.71
44+
// Negative Precision: 0.90
45+
// Negative Recall: 0.94
46+
// Positive Precision: 0.76
47+
// Positive Recall: 0.67
5148
}
5249
}
5350
}

docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/SDCALogisticRegression.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
using Microsoft.ML.Data;
44
using Microsoft.ML.Trainers;
55

6-
namespace Microsoft.ML.Samples.Dynamic
6+
namespace Microsoft.ML.Samples.Dynamic.Trainers.BinaryClassification
77
{
88
public static class SDCALogisticRegression
99
{

docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/SDCASupportVectorMachine.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
using System.Linq;
33
using Microsoft.ML.Data;
44

5-
namespace Microsoft.ML.Samples.Dynamic
5+
namespace Microsoft.ML.Samples.Dynamic.Trainers.BinaryClassification
66
{
77
public static class SDCASupportVectorMachine
88
{

docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/SymbolicStochasticGradientDescent.cs

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
namespace Microsoft.ML.Samples.Dynamic
1+
namespace Microsoft.ML.Samples.Dynamic.Trainers.BinaryClassification
22
{
33
public static class SymbolicStochasticGradientDescent
44
{
@@ -24,15 +24,17 @@ public static void Example()
2424

2525
// Evaluate how the model is doing on the test data.
2626
var dataWithPredictions = model.Transform(split.TestSet);
27-
var metrics = mlContext.BinaryClassification.EvaluateNonCalibrated(dataWithPredictions, "IsOver50K");
27+
var metrics = mlContext.BinaryClassification.EvaluateNonCalibrated(dataWithPredictions);
2828
SamplesUtils.ConsoleUtils.PrintMetrics(metrics);
29-
// Accuracy: 0.85
30-
// AUC: 0.90
31-
// F1 Score: 0.64
32-
// Negative Precision: 0.88
33-
// Negative Recall: 0.93
34-
// Positive Precision: 0.72
35-
// Positive Recall: 0.58
29+
30+
// Expected output:
31+
// Accuracy: 0.85
32+
// AUC: 0.90
33+
// F1 Score: 0.64
34+
// Negative Precision: 0.88
35+
// Negative Recall: 0.93
36+
// Positive Precision: 0.72
37+
// Positive Recall: 0.58
3638
}
3739
}
3840
}

docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/SymbolicStochasticGradientDescentWithOptions.cs

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
namespace Microsoft.ML.Samples.Dynamic
1+
namespace Microsoft.ML.Samples.Dynamic.Trainers.BinaryClassification
22
{
33
public static class SymbolicStochasticGradientDescentWithOptions
44
{
@@ -22,7 +22,6 @@ public static void Example()
2222
var pipeline = mlContext.BinaryClassification.Trainers.SymbolicStochasticGradientDescent(
2323
new ML.Trainers.HalLearners.SymSgdClassificationTrainer.Options()
2424
{
25-
LabelColumn = "IsOver50K",
2625
LearningRate = 0.2f,
2726
NumberOfIterations = 10,
2827
NumberOfThreads = 1,
@@ -33,15 +32,17 @@ public static void Example()
3332

3433
// Evaluate how the model is doing on the test data.
3534
var dataWithPredictions = model.Transform(split.TestSet);
36-
var metrics = mlContext.BinaryClassification.EvaluateNonCalibrated(dataWithPredictions, "IsOver50K");
35+
var metrics = mlContext.BinaryClassification.EvaluateNonCalibrated(dataWithPredictions);
3736
SamplesUtils.ConsoleUtils.PrintMetrics(metrics);
38-
// Accuracy: 0.84
39-
// AUC: 0.88
40-
// F1 Score: 0.60
41-
// Negative Precision: 0.87
42-
// Negative Recall: 0.93
43-
// Positive Precision: 0.69
44-
// Positive Recall: 0.53
37+
38+
// Expected output:
39+
// Accuracy: 0.84
40+
// AUC: 0.88
41+
// F1 Score: 0.60
42+
// Negative Precision: 0.87
43+
// Negative Recall: 0.93
44+
// Positive Precision: 0.69
45+
// Positive Recall: 0.53
4546
}
4647
}
4748
}
Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,9 @@
33
using Microsoft.ML.Data;
44
using Microsoft.ML.SamplesUtils;
55

6-
namespace Microsoft.ML.Samples.Dynamic
6+
namespace Microsoft.ML.Samples.Dynamic.Trainers.MulticlassClassification
77
{
8-
class LightGbmMulticlassClassification
8+
class LightGbm
99
{
1010
// This example requires installation of additional nuget package <a href="https://www.nuget.org/packages/Microsoft.ML.LightGBM/">Microsoft.ML.LightGBM</a>.
1111
public static void Example()
Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,9 @@
55
using Microsoft.ML.SamplesUtils;
66
using static Microsoft.ML.LightGBM.Options;
77

8-
namespace Microsoft.ML.Samples.Dynamic
8+
namespace Microsoft.ML.Samples.Dynamic.Trainers.MulticlassClassification
99
{
10-
class LightGbmMulticlassClassificationWithOptions
10+
class LightGbmWithOptions
1111
{
1212
// This example requires installation of additional nuget package <a href="https://www.nuget.org/packages/Microsoft.ML.LightGBM/">Microsoft.ML.LightGBM</a>.
1313
public static void Example()
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
using Microsoft.ML;
2+
3+
namespace Microsoft.ML.Samples.Dynamic.Trainers.Ranking
4+
{
5+
public class LightGbm
6+
{
7+
// This example requires installation of additional nuget package <a href="https://www.nuget.org/packages/Microsoft.ML.LightGBM/">Microsoft.ML.LightGBM</a>.
8+
public static void Example()
9+
{
10+
// Creating the ML.Net IHostEnvironment object, needed for the pipeline.
11+
var mlContext = new MLContext();
12+
13+
// Download and featurize the dataset.
14+
var dataview = SamplesUtils.DatasetUtils.LoadFeaturizedMslrWeb10kDataset(mlContext);
15+
16+
// Leave out 10% of the dataset for testing. Since this is a ranking problem, we must ensure that the split
17+
// respects the GroupId column, i.e. rows with the same GroupId are either all in the train split or all in
18+
// the test split. The samplingKeyColumn parameter in Ranking.TrainTestSplit is used for this purpose.
19+
var split = mlContext.Ranking.TrainTestSplit(dataview, testFraction: 0.1, samplingKeyColumn: "GroupId");
20+
21+
// Create the Estimator pipeline. For simplicity, we will train a small tree with 4 leaves and 2 boosting iterations.
22+
var pipeline = mlContext.Ranking.Trainers.LightGbm(
23+
numLeaves: 4,
24+
minDataPerLeaf: 10,
25+
learningRate: 0.1,
26+
numBoostRound: 2);
27+
28+
// Fit this Pipeline to the Training Data.
29+
var model = pipeline.Fit(split.TrainSet);
30+
31+
// Evaluate how the model is doing on the test data.
32+
var dataWithPredictions = model.Transform(split.TestSet);
33+
34+
var metrics = mlContext.Ranking.Evaluate(dataWithPredictions);
35+
SamplesUtils.ConsoleUtils.PrintMetrics(metrics);
36+
37+
// Expected output:
38+
// DCG: @1:1.71, @2:3.88, @3:7.93
39+
// NDCG: @1:7.98, @2:12.14, @3:16.62
40+
}
41+
}
42+
}
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
using Microsoft.ML.LightGBM;
2+
using static Microsoft.ML.LightGBM.Options;
3+
4+
namespace Microsoft.ML.Samples.Dynamic.Trainers.Ranking
5+
{
6+
public class LightGbmWithOptions
7+
{
8+
// This example requires installation of additional nuget package <a href="https://www.nuget.org/packages/Microsoft.ML.LightGBM/">Microsoft.ML.LightGBM</a>.
9+
public static void Example()
10+
{
11+
// Creating the ML.Net IHostEnvironment object, needed for the pipeline.
12+
var mlContext = new MLContext();
13+
14+
// Download and featurize the train and validation datasets.
15+
var dataview = SamplesUtils.DatasetUtils.LoadFeaturizedMslrWeb10kDataset(mlContext);
16+
17+
// Leave out 10% of the dataset for testing. Since this is a ranking problem, we must ensure that the split
18+
// respects the GroupId column, i.e. rows with the same GroupId are either all in the train split or all in
19+
// the test split. The samplingKeyColumn parameter in Ranking.TrainTestSplit is used for this purpose.
20+
var split = mlContext.Ranking.TrainTestSplit(dataview, testFraction: 0.1, samplingKeyColumn: "GroupId");
21+
22+
// Create the Estimator pipeline. For simplicity, we will train a small tree with 4 leaves and 2 boosting iterations.
23+
var pipeline = mlContext.Ranking.Trainers.LightGbm(
24+
new Options
25+
{
26+
NumLeaves = 4,
27+
MinDataPerLeaf = 10,
28+
LearningRate = 0.1,
29+
NumBoostRound = 2,
30+
Booster = new TreeBooster.Options
31+
{
32+
FeatureFraction = 0.9
33+
}
34+
});
35+
36+
// Fit this pipeline to the training Data.
37+
var model = pipeline.Fit(split.TrainSet);
38+
39+
// Evaluate how the model is doing on the test data.
40+
var dataWithPredictions = model.Transform(split.TestSet);
41+
42+
var metrics = mlContext.Ranking.Evaluate(dataWithPredictions);
43+
SamplesUtils.ConsoleUtils.PrintMetrics(metrics);
44+
45+
// Expected output:
46+
// DCG: @1:1.71, @2:3.88, @3:7.93
47+
// NDCG: @1:7.98, @2:12.14, @3:16.62
48+
}
49+
}
50+
}

0 commit comments

Comments (0)