-
Notifications
You must be signed in to change notification settings - Fork 1.9k
Adding training statistics for LR in the HAL learners package. #1392
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
2d5fbf6
68dd4a6
e8fede2
f649287
06f9704
96127d0
3831f5d
c638cbd
dd9524e
e540d63
2752b60
fe29307
f0b4707
5386a8c
fb897ed
377b462
89301b4
46316ea
c8d060a
737d173
39ca55e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,92 @@ | ||
// Licensed to the .NET Foundation under one or more agreements. | ||
// The .NET Foundation licenses this file to you under the MIT license. | ||
// See the LICENSE file in the project root for more information. | ||
|
||
using Microsoft.ML.Runtime.Data; | ||
using Microsoft.ML.Runtime.Internal.Utilities; | ||
using Microsoft.ML.Trainers.HalLearners; | ||
using System; | ||
|
||
namespace Microsoft.ML.Runtime.Learners | ||
{ | ||
using Mkl = OlsLinearRegressionTrainer.Mkl; | ||
|
||
public sealed class ComputeLRTrainingStdThroughHal : ComputeLRTrainingStd
{
    /// <summary>
    /// Computes the standard deviation matrix of each of the non-zero training weights, needed to calculate further the standard deviation,
    /// p-value and z-Score. This implementation uses the Intel MKL routines (Cholesky decomposition and inversion of the
    /// packed Hessian) and is therefore hardware accelerated.
    /// Due to the existence of regularization, an approximation is used to compute the variances of the trained linear coefficients.
    /// </summary>
    /// <param name="hessian">The packed (lower-triangular, row-major) Hessian computed at the end of training. Modified in place by the MKL routines; the caller's array reference is consumed.</param>
    /// <param name="weightIndices">The indices of the non-zero weights; used as the indices of the returned sparse vector.</param>
    /// <param name="numSelectedParams">The number of selected (non-zero) parameters, including the bias.</param>
    /// <param name="currentWeightsCount">The total length of the weights vector, including the bias.</param>
    /// <param name="ch">The <see cref="IChannel"/> used for messaging.</param>
    /// <param name="l2Weight">The L2Weight used for training. (Supply the same one that got used during training.)</param>
    public override VBuffer<float> ComputeStd(double[] hessian, int[] weightIndices, int numSelectedParams, int currentWeightsCount, IChannel ch, float l2Weight)
    {
        Contracts.AssertValue(ch);
        Contracts.AssertValue(hessian, nameof(hessian));
        Contracts.Assert(numSelectedParams > 0);
        Contracts.Assert(currentWeightsCount > 0);
        Contracts.Assert(l2Weight > 0);

        // Apply Cholesky Decomposition to find the inverse of the Hessian.
        double[] invHessian = null;
        try
        {
            // First, find the Cholesky decomposition LL' of the Hessian.
            Mkl.Pptrf(Mkl.Layout.RowMajor, Mkl.UpLo.Lo, numSelectedParams, hessian);
            // Note that hessian is already modified at this point. It is no longer the original Hessian,
            // but instead represents the Cholesky decomposition L.
            // Also note that the following routine is supposed to consume the Cholesky decomposition L instead
            // of the original information matrix.
            Mkl.Pptri(Mkl.Layout.RowMajor, Mkl.UpLo.Lo, numSelectedParams, hessian);
            // At this point, hessian should contain the inverse of the original Hessian matrix.
            // Swap hessian with invHessian to avoid confusion in the following context.
            Utils.Swap(ref hessian, ref invHessian);
            Contracts.Assert(hessian == null);
        }
        catch (DllNotFoundException)
        {
            throw ch.ExceptNotSupp("The MKL library (MklImports.dll) or one of its dependencies is missing.");
        }

        // Seed the variances with the diagonal of the inverse Hessian. In row-major lower packed
        // storage the diagonal entry of row i lives at offset i * (i + 1) / 2 + i.
        float[] stdErrorValues = new float[numSelectedParams];
        stdErrorValues[0] = (float)Math.Sqrt(invHessian[0]);

        for (int i = 1; i < numSelectedParams; i++)
        {
            // Initialize with the inverse Hessian diagonal.
            stdErrorValues[i] = (float)invHessian[i * (i + 1) / 2 + i];
        }

        if (l2Weight > 0)
        {
            // Iterate through all entries of inverse Hessian to make adjustment to variance.
            // A discussion on ridge regularized LR coefficient covariance matrix can be found here:
            // http://www.aloki.hu/pdf/0402_171179.pdf (Equations 11 and 25)
            // http://www.inf.unibz.it/dis/teaching/DWDM/project2010/LogisticRegression.pdf (Section "Significance testing in ridge logistic regression")
            int ioffset = 1;
            for (int iRow = 1; iRow < numSelectedParams; iRow++)
            {
                for (int iCol = 0; iCol <= iRow; iCol++)
                {
                    var entry = (float)invHessian[ioffset++];
                    AdjustVariance(entry, iRow, iCol, l2Weight, stdErrorValues);
                }
            }

            // The walk above must have consumed exactly the strict lower triangle plus diagonal.
            Contracts.Assert(ioffset == invHessian.Length);
        }

        // Convert the accumulated variances into standard deviations (entry 0 was already rooted).
        for (int i = 1; i < numSelectedParams; i++)
            stdErrorValues[i] = (float)Math.Sqrt(stdErrorValues[i]);

        // currentWeights vector size is the selected weights plus the bias; return a sparse vector
        // over the full weight count with values only at the selected indices.
        return new VBuffer<float>(currentWeightsCount, numSelectedParams, stdErrorValues, weightIndices);
    }
}
} |
Original file line number | Diff line number | Diff line change | ||||||||
---|---|---|---|---|---|---|---|---|---|---|
|
@@ -4,6 +4,7 @@ | |||||||||
|
||||||||||
using System; | ||||||||||
using System.Collections.Generic; | ||||||||||
using MathNet.Numerics.LinearAlgebra; | ||||||||||
using Microsoft.ML.Core.Data; | ||||||||||
using Microsoft.ML.Runtime; | ||||||||||
using Microsoft.ML.Runtime.CommandLine; | ||||||||||
|
@@ -40,11 +41,27 @@ public sealed partial class LogisticRegression : LbfgsTrainerBase<LogisticRegres | |||||||||
|
||||||||||
public sealed class Arguments : ArgumentsBase
{
    /// <summary>
    /// If set to <value>true</value>, training statistics will be generated at the end of training.
    /// If you have a large number of learned training parameters (more than 500),
    /// generating the training statistics might take a few seconds.
    /// More than 1000 weights might take a few minutes. For those cases consider using the instance of <see cref="ComputeLRTrainingStd"/>
    /// present in the Microsoft.ML.HalLearners package. That computes the statistics using hardware acceleration.
    /// </summary>
    [Argument(ArgumentType.AtMostOnce, HelpText = "Show statistics of training examples.", ShortName = "stat", SortOrder = 50)]
    public bool ShowTrainingStats = false;

    /// <summary>
    /// The instance of <see cref="ComputeLRTrainingStd"/> that computes the training statistics at the end of training.
    /// If you have a large number of learned training parameters (more than 500),
    /// generating the training statistics might take a few seconds.
    /// More than 1000 weights might take a few minutes. For those cases consider using the instance of <see cref="ComputeLRTrainingStd"/>
    /// present in the Microsoft.ML.HalLearners package. That computes the statistics using hardware acceleration.
    /// </summary>
    public ComputeLRTrainingStd StdComputer;
}
|
||||||||||
private Double _posWeight; | ||||||||||
private double _posWeight; | ||||||||||
private LinearModelStatistics _stats; | ||||||||||
|
||||||||||
/// <summary> | ||||||||||
|
@@ -78,6 +95,9 @@ public LogisticRegression(IHostEnvironment env, | |||||||||
|
||||||||||
_posWeight = 0; | ||||||||||
ShowTrainingStats = Args.ShowTrainingStats; | ||||||||||
|
||||||||||
if (ShowTrainingStats && Args.StdComputer == null) | ||||||||||
Args.StdComputer = new ComputeLRTrainingStdImpl(); | ||||||||||
} | ||||||||||
|
||||||||||
/// <summary> | ||||||||||
|
@@ -88,6 +108,9 @@ internal LogisticRegression(IHostEnvironment env, Arguments args) | |||||||||
{ | ||||||||||
_posWeight = 0; | ||||||||||
ShowTrainingStats = Args.ShowTrainingStats; | ||||||||||
|
||||||||||
if (ShowTrainingStats && Args.StdComputer == null) | ||||||||||
Args.StdComputer = new ComputeLRTrainingStdImpl(); | ||||||||||
} | ||||||||||
|
||||||||||
public override PredictionKind PredictionKind => PredictionKind.BinaryClassification; | ||||||||||
|
@@ -330,7 +353,13 @@ protected override void ComputeTrainingStatistics(IChannel ch, FloatLabelCursor. | |||||||||
} | ||||||||||
} | ||||||||||
|
||||||||||
_stats = new LinearModelStatistics(Host, NumGoodRows, numParams, deviance, nullDeviance); | ||||||||||
if (Args.StdComputer == null) | ||||||||||
_stats = new LinearModelStatistics(Host, NumGoodRows, numParams, deviance, nullDeviance); | ||||||||||
else | ||||||||||
{ | ||||||||||
var std = Args.StdComputer.ComputeStd(hessian, weightIndices, numParams, CurrentWeights.Length, ch, L2Weight); | ||||||||||
_stats = new LinearModelStatistics(Host, NumGoodRows, numParams, deviance, nullDeviance, std); | ||||||||||
} | ||||||||||
} | ||||||||||
|
||||||||||
protected override void ProcessPriorDistribution(float label, float weight) | ||||||||||
|
@@ -397,4 +426,125 @@ public static CommonOutputs.BinaryClassificationOutput TrainBinary(IHostEnvironm | |||||||||
() => LearnerEntryPointsUtils.FindColumn(host, input.TrainingData.Schema, input.WeightColumn)); | ||||||||||
} | ||||||||||
} | ||||||||||
|
||||||||||
/// <summary>
/// Computes the standard deviation matrix of each of the non-zero training weights, needed to calculate further the standard deviation,
/// p-value and z-Score.
/// If you need fast calculations, use the <see cref="ComputeLRTrainingStd"/> implementation in the Microsoft.ML.HALLearners package,
/// which makes use of hardware acceleration.
/// Due to the existence of regularization, an approximation is used to compute the variances of the trained linear coefficients.
/// </summary>
public abstract class ComputeLRTrainingStd
{
    /// <summary>
    /// Computes the standard deviation matrix of each of the non-zero training weights, needed to calculate further the standard deviation,
    /// p-value and z-Score.
    /// If you need fast calculations, use the ComputeStd method from the Microsoft.ML.HALLearners package, which makes use of hardware acceleration.
    /// Due to the existence of regularization, an approximation is used to compute the variances of the trained linear coefficients.
    /// </summary>
    /// <param name="hessian">The packed triangular Hessian matrix computed at the end of training.</param>
    /// <param name="weightIndices">The indices of the non-zero weights; used as the indices of the returned sparse vector.</param>
    /// <param name="parametersCount">The number of selected (non-zero) parameters, including the bias.</param>
    /// <param name="currentWeightsCount">The total length of the weights vector, including the bias.</param>
    /// <param name="ch">The <see cref="IChannel"/> used for messaging.</param>
    /// <param name="l2Weight">The L2Weight used for training. (Supply the same one that got used during training.)</param>
    public abstract VBuffer<float> ComputeStd(double[] hessian, int[] weightIndices, int parametersCount, int currentWeightsCount, IChannel ch, float l2Weight);

    /// <summary>
    /// Adjust the variance for regularized cases.
    /// </summary>
    /// <param name="inverseEntry">An entry of the inverse Hessian at (<paramref name="iRow"/>, <paramref name="iCol"/>).</param>
    /// <param name="iRow">Row index of the entry.</param>
    /// <param name="iCol">Column index of the entry.</param>
    /// <param name="l2Weight">The L2 regularization weight used during training.</param>
    /// <param name="stdErrorValues2">The running variance estimates, adjusted in place.</param>
    [BestFriend]
    internal void AdjustVariance(float inverseEntry, int iRow, int iCol, float l2Weight, float[] stdErrorValues2)
    {
        var adjustment = l2Weight * inverseEntry * inverseEntry;
        stdErrorValues2[iRow] -= adjustment;

        // NOTE(review): column 0 and the diagonal (iCol == iRow) intentionally skip the symmetric
        // adjustment here — presumably because index 0 is the bias term; confirm against the
        // Hessian layout used by the trainer.
        if (0 < iCol && iCol < iRow)
            stdErrorValues2[iCol] -= adjustment;
    }
}
|
||||||||||
/// <summary>
/// Extends the <see cref="ComputeLRTrainingStd"/> implementing <see cref="ComputeLRTrainingStd.ComputeStd(double[], int[], int, int, IChannel, float)"/> making use of Math.NET Numerics.
/// If you need faster calculations (have non-sparse weight vectors of more than 300 features), use the instance of ComputeLRTrainingStd from the Microsoft.ML.HALLearners package, which makes use of hardware acceleration
/// for those computations.
/// </summary>
public sealed class ComputeLRTrainingStdImpl : ComputeLRTrainingStd
{
    /// <summary>
    /// Computes the standard deviation matrix of each of the non-zero training weights, needed to calculate further the standard deviation,
    /// p-value and z-Score.
    /// If you need faster calculations, use the ComputeStd method from the Microsoft.ML.HALLearners package, which makes use of hardware acceleration.
    /// Due to the existence of regularization, an approximation is used to compute the variances of the trained linear coefficients.
    /// Math.NET Numerics is used here only to invert the reconstructed dense Hessian matrix.
    /// </summary>
    /// <param name="hessian">The packed triangular Hessian matrix computed at the end of training; read but not modified here.</param>
    /// <param name="weightIndices">The indices of the non-zero weights; used as the indices of the returned sparse vector.</param>
    /// <param name="numSelectedParams">The number of selected (non-zero) parameters, including the bias.</param>
    /// <param name="currentWeightsCount">The total length of the weights vector, including the bias.</param>
    /// <param name="ch">The <see cref="IChannel"/> used for messaging.</param>
    /// <param name="l2Weight">The L2Weight used for training. (Supply the same one that got used during training.)</param>
    public override VBuffer<float> ComputeStd(double[] hessian, int[] weightIndices, int numSelectedParams, int currentWeightsCount, IChannel ch, float l2Weight)
    {
        Contracts.AssertValue(ch);
        Contracts.AssertValue(hessian, nameof(hessian));
        Contracts.Assert(numSelectedParams > 0);
        Contracts.Assert(currentWeightsCount > 0);
        Contracts.Assert(l2Weight > 0);

        // Unpack the packed triangular `hessian` array into a full dense matrix.
        // NOTE(review): entries are written bottom-row-up and mirrored across the ANTI-diagonal
        // (see the `[dimension - col, dimension - row]` write below); consequently the "diagonal"
        // of the inverse is read at [i, numSelectedParams - i - 1] further down. This layout is
        // assumed to match how the trainer packed the Hessian — confirm against the LBFGS
        // training code that produces `hessian`.
        double[,] matrixHessian = new double[numSelectedParams, numSelectedParams];

        int hessianLength = 0;
        int dimension = numSelectedParams - 1;

        for (int row = dimension; row >= 0; row--)
        {
            for (int col = 0; col <= dimension; col++)
            {
                if ((row + col) <= dimension)
                {
                    if ((row + col) == dimension)
                    {
                        // Anti-diagonal entry: written exactly once.
                        matrixHessian[row, col] = hessian[hessianLength];
                    }
                    else
                    {
                        // Off-anti-diagonal entry: also mirror it across the anti-diagonal.
                        matrixHessian[row, col] = hessian[hessianLength];
                        matrixHessian[dimension - col, dimension - row] = hessian[hessianLength];
                    }
                    hessianLength++;
                }
                else
                    continue;
            }
        }

        // Math.NET Numerics is used only to compute the inverse of the dense Hessian.
        var h = Matrix<double>.Build.DenseOfArray(matrixHessian);

        var invers = h.Inverse();

        // Seed the variances with the (anti-diagonal-mirrored) diagonal of the inverse Hessian.
        float[] stdErrorValues = new float[numSelectedParams];
        stdErrorValues[0] = (float)Math.Sqrt(invers[0, numSelectedParams - 1]);

        for (int i = 1; i < numSelectedParams; i++)
        {
            // Initialize with inverse Hessian.
            // The diagonal of the inverse Hessian (in the mirrored layout described above).
            stdErrorValues[i] = (float)invers[i, numSelectedParams - i - 1];
        }

        if (l2Weight > 0)
        {
            // Iterate through all entries of inverse Hessian to make adjustment to variance.
            // A discussion on ridge regularized LR coefficient covariance matrix can be found here:
            // http://www.aloki.hu/pdf/0402_171179.pdf (Equations 11 and 25)
            // http://www.inf.unibz.it/dis/teaching/DWDM/project2010/LogisticRegression.pdf (Section "Significance testing in ridge logistic regression")
            for (int iRow = 1; iRow < numSelectedParams; iRow++)
            {
                for (int iCol = 0; iCol <= iRow; iCol++)
                {
                    float entry = (float)invers[iRow, numSelectedParams - iCol - 1];
                    AdjustVariance(entry, iRow, iCol, l2Weight, stdErrorValues);
                }
            }
        }

        // Convert the accumulated variances into standard deviations (entry 0 was already rooted).
        for (int i = 1; i < numSelectedParams; i++)
            stdErrorValues[i] = (float)Math.Sqrt(stdErrorValues[i]);

        // Return a sparse vector over the full weight count with values only at the selected indices.
        return new VBuffer<float>(currentWeightsCount, numSelectedParams, stdErrorValues, weightIndices);
    }
}
} |
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
note: you'll need to add this PackageReference to our NuGet package as well:
machinelearning/pkg/Microsoft.ML/Microsoft.ML.nupkgproj
Lines 11 to 17 in f222025