-
Notifications
You must be signed in to change notification settings - Fork 1.9k
Adding training statistics for LR in the HAL learners package. #1392
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
2d5fbf6
68dd4a6
e8fede2
f649287
06f9704
96127d0
3831f5d
c638cbd
dd9524e
e540d63
2752b60
fe29307
f0b4707
5386a8c
fb897ed
377b462
89301b4
46316ea
c8d060a
737d173
39ca55e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,92 @@ | ||
// Licensed to the .NET Foundation under one or more agreements. | ||
// The .NET Foundation licenses this file to you under the MIT license. | ||
// See the LICENSE file in the project root for more information. | ||
|
||
using Microsoft.ML.Runtime.Data; | ||
using Microsoft.ML.Runtime.Internal.Utilities; | ||
using Microsoft.ML.Trainers.HalLearners; | ||
using System; | ||
|
||
namespace Microsoft.ML.Runtime.Learners | ||
{ | ||
using Mkl = OlsLinearRegressionTrainer.Mkl; | ||
|
||
public sealed class ComputeLRTrainingStdThroughHal : ComputeLRTrainingStd
{
    /// <summary>
    /// Computes the standard deviation matrix of each of the non-zero training weights, needed to calculate further the standard deviation,
    /// p-value and z-Score. This implementation uses the Intel MKL routines (Cholesky decomposition and inversion of the
    /// packed Hessian) and is therefore hardware accelerated.
    /// Due to the existence of regularization, an approximation is used to compute the variances of the trained linear coefficients.
    /// </summary>
    /// <param name="hessian">The packed (lower-triangular, row-major) Hessian computed at the end of training. Modified in place by the MKL routines; the caller's array reference is consumed.</param>
    /// <param name="weightIndices">The indices of the non-zero weights; used as the indices of the returned sparse vector.</param>
    /// <param name="numSelectedParams">The number of selected (non-zero) parameters, including the bias.</param>
    /// <param name="currentWeightsCount">The total length of the weights vector, including the bias.</param>
    /// <param name="ch">The <see cref="IChannel"/> used for messaging.</param>
    /// <param name="l2Weight">The L2Weight used for training. (Supply the same one that got used during training.)</param>
    public override VBuffer<float> ComputeStd(double[] hessian, int[] weightIndices, int numSelectedParams, int currentWeightsCount, IChannel ch, float l2Weight)
    {
        Contracts.AssertValue(ch);
        Contracts.AssertValue(hessian, nameof(hessian));
        Contracts.Assert(numSelectedParams > 0);
        Contracts.Assert(currentWeightsCount > 0);
        Contracts.Assert(l2Weight > 0);

        // Apply Cholesky Decomposition to find the inverse of the Hessian.
        double[] invHessian = null;
        try
        {
            // First, find the Cholesky decomposition LL' of the Hessian.
            Mkl.Pptrf(Mkl.Layout.RowMajor, Mkl.UpLo.Lo, numSelectedParams, hessian);
            // Note that hessian is already modified at this point. It is no longer the original Hessian,
            // but instead represents the Cholesky decomposition L.
            // Also note that the following routine is supposed to consume the Cholesky decomposition L instead
            // of the original information matrix.
            Mkl.Pptri(Mkl.Layout.RowMajor, Mkl.UpLo.Lo, numSelectedParams, hessian);
            // At this point, hessian should contain the inverse of the original Hessian matrix.
            // Swap hessian with invHessian to avoid confusion in the following context.
            Utils.Swap(ref hessian, ref invHessian);
            Contracts.Assert(hessian == null);
        }
        catch (DllNotFoundException)
        {
            throw ch.ExceptNotSupp("The MKL library (MklImports.dll) or one of its dependencies is missing.");
        }

        // Seed the variances with the diagonal of the inverse Hessian. In row-major lower packed
        // storage the diagonal entry of row i lives at offset i * (i + 1) / 2 + i.
        float[] stdErrorValues = new float[numSelectedParams];
        stdErrorValues[0] = (float)Math.Sqrt(invHessian[0]);

        for (int i = 1; i < numSelectedParams; i++)
        {
            // Initialize with the inverse Hessian diagonal.
            stdErrorValues[i] = (float)invHessian[i * (i + 1) / 2 + i];
        }

        if (l2Weight > 0)
        {
            // Iterate through all entries of inverse Hessian to make adjustment to variance.
            // A discussion on ridge regularized LR coefficient covariance matrix can be found here:
            // http://www.aloki.hu/pdf/0402_171179.pdf (Equations 11 and 25)
            // http://www.inf.unibz.it/dis/teaching/DWDM/project2010/LogisticRegression.pdf (Section "Significance testing in ridge logistic regression")
            int ioffset = 1;
            for (int iRow = 1; iRow < numSelectedParams; iRow++)
            {
                for (int iCol = 0; iCol <= iRow; iCol++)
                {
                    var entry = (float)invHessian[ioffset++];
                    AdjustVariance(entry, iRow, iCol, l2Weight, stdErrorValues);
                }
            }

            // The walk above must have consumed exactly the strict lower triangle plus diagonal.
            Contracts.Assert(ioffset == invHessian.Length);
        }

        // Convert the accumulated variances into standard deviations (entry 0 was already rooted).
        for (int i = 1; i < numSelectedParams; i++)
            stdErrorValues[i] = (float)Math.Sqrt(stdErrorValues[i]);

        // currentWeights vector size is the selected weights plus the bias; return a sparse vector
        // over the full weight count with values only at the selected indices.
        return new VBuffer<float>(currentWeightsCount, numSelectedParams, stdErrorValues, weightIndices);
    }
}
} |
Original file line number | Diff line number | Diff line change | ||||||||
---|---|---|---|---|---|---|---|---|---|---|
|
@@ -4,6 +4,7 @@ | |||||||||
|
||||||||||
using System; | ||||||||||
using System.Collections.Generic; | ||||||||||
using MathNet.Numerics.LinearAlgebra; | ||||||||||
using Microsoft.ML.Core.Data; | ||||||||||
using Microsoft.ML.Runtime; | ||||||||||
using Microsoft.ML.Runtime.CommandLine; | ||||||||||
|
@@ -40,11 +41,27 @@ public sealed partial class LogisticRegression : LbfgsTrainerBase<LogisticRegres | |||||||||
|
||||||||||
public sealed class Arguments : ArgumentsBase
{
    /// <summary>
    /// If set to <value>true</value>, training statistics will be generated at the end of training.
    /// If you have a large number of learned training parameters (more than 500),
    /// generating the training statistics might take a few seconds.
    /// More than 1000 weights might take a few minutes. For those cases consider using the instance of <see cref="ComputeLRTrainingStd"/>
    /// present in the Microsoft.ML.HalLearners package. That computes the statistics using hardware acceleration.
    /// </summary>
    [Argument(ArgumentType.AtMostOnce, HelpText = "Show statistics of training examples.", ShortName = "stat", SortOrder = 50)]
    public bool ShowTrainingStats = false;

    /// <summary>
    /// The instance of <see cref="ComputeLRTrainingStd"/> that computes the training statistics at the end of training.
    /// If you have a large number of learned training parameters (more than 500),
    /// generating the training statistics might take a few seconds.
    /// More than 1000 weights might take a few minutes. For those cases consider using the instance of <see cref="ComputeLRTrainingStd"/>
    /// present in the Microsoft.ML.HalLearners package. That computes the statistics using hardware acceleration.
    /// </summary>
    public ComputeLRTrainingStd StdComputer;
}
|
||||||||||
private Double _posWeight; | ||||||||||
private double _posWeight; | ||||||||||
private LinearModelStatistics _stats; | ||||||||||
|
||||||||||
/// <summary> | ||||||||||
|
@@ -78,6 +95,9 @@ public LogisticRegression(IHostEnvironment env, | |||||||||
|
||||||||||
_posWeight = 0; | ||||||||||
ShowTrainingStats = Args.ShowTrainingStats; | ||||||||||
|
||||||||||
if (ShowTrainingStats && Args.StdComputer == null) | ||||||||||
Args.StdComputer = new ComputeLRTrainingStdImpl(); | ||||||||||
} | ||||||||||
|
||||||||||
/// <summary> | ||||||||||
|
@@ -88,6 +108,9 @@ internal LogisticRegression(IHostEnvironment env, Arguments args) | |||||||||
{ | ||||||||||
_posWeight = 0; | ||||||||||
ShowTrainingStats = Args.ShowTrainingStats; | ||||||||||
|
||||||||||
if (ShowTrainingStats && Args.StdComputer == null) | ||||||||||
Args.StdComputer = new ComputeLRTrainingStdImpl(); | ||||||||||
} | ||||||||||
|
||||||||||
public override PredictionKind PredictionKind => PredictionKind.BinaryClassification; | ||||||||||
|
@@ -330,7 +353,13 @@ protected override void ComputeTrainingStatistics(IChannel ch, FloatLabelCursor. | |||||||||
} | ||||||||||
} | ||||||||||
|
||||||||||
_stats = new LinearModelStatistics(Host, NumGoodRows, numParams, deviance, nullDeviance); | ||||||||||
if (Args.StdComputer == null) | ||||||||||
_stats = new LinearModelStatistics(Host, NumGoodRows, numParams, deviance, nullDeviance); | ||||||||||
else | ||||||||||
{ | ||||||||||
var std = Args.StdComputer.ComputeStd(hessian, weightIndices, numParams, CurrentWeights.Length, ch, L2Weight); | ||||||||||
_stats = new LinearModelStatistics(Host, NumGoodRows, numParams, deviance, nullDeviance, std); | ||||||||||
} | ||||||||||
} | ||||||||||
|
||||||||||
protected override void ProcessPriorDistribution(float label, float weight) | ||||||||||
|
@@ -397,4 +426,125 @@ public static CommonOutputs.BinaryClassificationOutput TrainBinary(IHostEnvironm | |||||||||
() => LearnerEntryPointsUtils.FindColumn(host, input.TrainingData.Schema, input.WeightColumn)); | ||||||||||
} | ||||||||||
} | ||||||||||
|
||||||||||
/// <summary>
/// Computes the standard deviation matrix of each of the non-zero training weights, needed to calculate further the standard deviation,
/// p-value and z-Score.
/// If you need fast calculations, use the <see cref="ComputeLRTrainingStd"/> implementation in the Microsoft.ML.HALLearners package,
/// which makes use of hardware acceleration.
/// Due to the existence of regularization, an approximation is used to compute the variances of the trained linear coefficients.
/// </summary>
public abstract class ComputeLRTrainingStd
{
    /// <summary>
    /// Computes the standard deviation matrix of each of the non-zero training weights, needed to calculate further the standard deviation,
    /// p-value and z-Score.
    /// If you need fast calculations, use the ComputeStd method from the Microsoft.ML.HALLearners package, which makes use of hardware acceleration.
    /// Due to the existence of regularization, an approximation is used to compute the variances of the trained linear coefficients.
    /// </summary>
    /// <param name="hessian">The packed triangular Hessian matrix computed at the end of training.</param>
    /// <param name="weightIndices">The indices of the non-zero weights; used as the indices of the returned sparse vector.</param>
    /// <param name="parametersCount">The number of selected (non-zero) parameters, including the bias.</param>
    /// <param name="currentWeightsCount">The total length of the weights vector, including the bias.</param>
    /// <param name="ch">The <see cref="IChannel"/> used for messaging.</param>
    /// <param name="l2Weight">The L2Weight used for training. (Supply the same one that got used during training.)</param>
    public abstract VBuffer<float> ComputeStd(double[] hessian, int[] weightIndices, int parametersCount, int currentWeightsCount, IChannel ch, float l2Weight);

    /// <summary>
    /// Adjust the variance for regularized cases.
    /// </summary>
    /// <param name="inverseEntry">An entry of the inverse Hessian at (<paramref name="iRow"/>, <paramref name="iCol"/>).</param>
    /// <param name="iRow">Row index of the entry.</param>
    /// <param name="iCol">Column index of the entry.</param>
    /// <param name="l2Weight">The L2 regularization weight used during training.</param>
    /// <param name="stdErrorValues2">The running variance estimates, adjusted in place.</param>
    [BestFriend]
    internal void AdjustVariance(float inverseEntry, int iRow, int iCol, float l2Weight, float[] stdErrorValues2)
    {
        var adjustment = l2Weight * inverseEntry * inverseEntry;
        stdErrorValues2[iRow] -= adjustment;

        // NOTE(review): column 0 and the diagonal (iCol == iRow) intentionally skip the symmetric
        // adjustment here — presumably because index 0 is the bias term; confirm against the
        // Hessian layout used by the trainer.
        if (0 < iCol && iCol < iRow)
            stdErrorValues2[iCol] -= adjustment;
    }
}
|
||||||||||
/// <summary>
/// Extends the <see cref="ComputeLRTrainingStd"/> implementing <see cref="ComputeLRTrainingStd.ComputeStd(double[], int[], int, int, IChannel, float)"/> making use of Math.NET Numerics.
/// If you need faster calculations (have non-sparse weight vectors of more than 300 features), use the instance of ComputeLRTrainingStd from the Microsoft.ML.HALLearners package, which makes use of hardware acceleration
/// for those computations.
/// </summary>
public sealed class ComputeLRTrainingStdImpl : ComputeLRTrainingStd
{
    /// <summary>
    /// Computes the standard deviation matrix of each of the non-zero training weights, needed to calculate further the standard deviation,
    /// p-value and z-Score.
    /// If you need faster calculations, use the ComputeStd method from the Microsoft.ML.HALLearners package, which makes use of hardware acceleration.
    /// Due to the existence of regularization, an approximation is used to compute the variances of the trained linear coefficients.
    /// Math.NET Numerics is used here only to invert the reconstructed dense Hessian matrix.
    /// </summary>
    /// <param name="hessian">The packed triangular Hessian matrix computed at the end of training; read but not modified here.</param>
    /// <param name="weightIndices">The indices of the non-zero weights; used as the indices of the returned sparse vector.</param>
    /// <param name="numSelectedParams">The number of selected (non-zero) parameters, including the bias.</param>
    /// <param name="currentWeightsCount">The total length of the weights vector, including the bias.</param>
    /// <param name="ch">The <see cref="IChannel"/> used for messaging.</param>
    /// <param name="l2Weight">The L2Weight used for training. (Supply the same one that got used during training.)</param>
    public override VBuffer<float> ComputeStd(double[] hessian, int[] weightIndices, int numSelectedParams, int currentWeightsCount, IChannel ch, float l2Weight)
    {
        Contracts.AssertValue(ch);
        Contracts.AssertValue(hessian, nameof(hessian));
        Contracts.Assert(numSelectedParams > 0);
        Contracts.Assert(currentWeightsCount > 0);
        Contracts.Assert(l2Weight > 0);

        // Unpack the packed triangular `hessian` array into a full dense matrix.
        // NOTE(review): entries are written bottom-row-up and mirrored across the ANTI-diagonal
        // (see the `[dimension - col, dimension - row]` write below); consequently the "diagonal"
        // of the inverse is read at [i, numSelectedParams - i - 1] further down. This layout is
        // assumed to match how the trainer packed the Hessian — confirm against the LBFGS
        // training code that produces `hessian`.
        double[,] matrixHessian = new double[numSelectedParams, numSelectedParams];

        int hessianLength = 0;
        int dimension = numSelectedParams - 1;

        for (int row = dimension; row >= 0; row--)
        {
            for (int col = 0; col <= dimension; col++)
            {
                if ((row + col) <= dimension)
                {
                    if ((row + col) == dimension)
                    {
                        // Anti-diagonal entry: written exactly once.
                        matrixHessian[row, col] = hessian[hessianLength];
                    }
                    else
                    {
                        // Off-anti-diagonal entry: also mirror it across the anti-diagonal.
                        matrixHessian[row, col] = hessian[hessianLength];
                        matrixHessian[dimension - col, dimension - row] = hessian[hessianLength];
                    }
                    hessianLength++;
                }
                else
                    continue;
            }
        }

        // Math.NET Numerics is used only to compute the inverse of the dense Hessian.
        var h = Matrix<double>.Build.DenseOfArray(matrixHessian);

        var invers = h.Inverse();

        // Seed the variances with the (anti-diagonal-mirrored) diagonal of the inverse Hessian.
        float[] stdErrorValues = new float[numSelectedParams];
        stdErrorValues[0] = (float)Math.Sqrt(invers[0, numSelectedParams - 1]);

        for (int i = 1; i < numSelectedParams; i++)
        {
            // Initialize with inverse Hessian.
            // The diagonal of the inverse Hessian (in the mirrored layout described above).
            stdErrorValues[i] = (float)invers[i, numSelectedParams - i - 1];
        }

        if (l2Weight > 0)
        {
            // Iterate through all entries of inverse Hessian to make adjustment to variance.
            // A discussion on ridge regularized LR coefficient covariance matrix can be found here:
            // http://www.aloki.hu/pdf/0402_171179.pdf (Equations 11 and 25)
            // http://www.inf.unibz.it/dis/teaching/DWDM/project2010/LogisticRegression.pdf (Section "Significance testing in ridge logistic regression")
            for (int iRow = 1; iRow < numSelectedParams; iRow++)
            {
                for (int iCol = 0; iCol <= iRow; iCol++)
                {
                    float entry = (float)invers[iRow, numSelectedParams - iCol - 1];
                    AdjustVariance(entry, iRow, iCol, l2Weight, stdErrorValues);
                }
            }
        }

        // Convert the accumulated variances into standard deviations (entry 0 was already rooted).
        for (int i = 1; i < numSelectedParams; i++)
            stdErrorValues[i] = (float)Math.Sqrt(stdErrorValues[i]);

        // Return a sparse vector over the full weight count with values only at the selected indices.
        return new VBuffer<float>(currentWeightsCount, numSelectedParams, stdErrorValues, weightIndices);
    }
}
} |
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
note: you'll need to add this PackageReference to our NuGet package as well:
machinelearning/pkg/Microsoft.ML/Microsoft.ML.nupkgproj
Lines 11 to 17 in f222025