Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Generalised Linear Models to FSharpStats #334

Merged
merged 54 commits into from
Oct 24, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
54 commits
Select commit Hold shift + click to select a range
6288d55
Implement IRLS solver for GLMs
LibraChris Feb 8, 2024
425d12b
Rename variables
LibraChris May 8, 2024
d1c9c41
add qr based GLM
LibraChris May 10, 2024
d637be7
add inital tests for the glm
LibraChris May 10, 2024
d0fc5ee
Update glm QR Solver
LibraChris May 14, 2024
d24955a
Add new Test for GLMs using Gamma Distribution
LibraChris May 14, 2024
ddcf09c
Add tests for the Poisson linker functions
LibraChris May 14, 2024
60c3ec1
Add tests for the Gamma linker functions
LibraChris May 14, 2024
0f0661c
Rename testcases to Reflect their log function
LibraChris May 14, 2024
fe83ba6
Add tests for the LogitLinkFunction
LibraChris May 14, 2024
ab44068
Add tests for the InverseSquaredLinkFunction
LibraChris May 15, 2024
c00980e
Add tests by example for glm IrLS solver
LibraChris May 15, 2024
1e6a524
Add tests for the IdentityLinkFunction
LibraChris May 15, 2024
ac416bf
Add tests groudwork for the BinomialLinkFunction
LibraChris May 15, 2024
8f88c1e
Add tests for the variance of Binominal Family
LibraChris May 16, 2024
148a933
Add tests for the variance of Poisson Family
LibraChris May 16, 2024
a8b5f00
Add tests for the variance of Gaussian/Normal Family
LibraChris May 16, 2024
2cbef3c
Fix test implemetation for familyVarianceFunctions
LibraChris May 16, 2024
a73a07e
Add tests for the variance of Gamma Family
LibraChris May 16, 2024
4d03d46
Add tests for the variance of Inv.Gaussian Family
LibraChris May 16, 2024
4465115
Rename test Cases based on their DistributionFamily
LibraChris May 16, 2024
a6e6568
Fix LogitLinkFunction
LibraChris May 16, 2024
de1fcd7
remove redundant BinomialLinkFunction
LibraChris May 16, 2024
3554a02
Remove redundant LinkFunction
LibraChris May 16, 2024
c1f38f1
Fix InverseSquaredLinkFunction
LibraChris May 16, 2024
2a3b096
Updated Gamma Distribution Variance function
LibraChris May 16, 2024
3f5a349
add Deriv Functions
LibraChris May 18, 2024
2787fbd
add Tests for Link and deriv
LibraChris May 18, 2024
3ee33e3
fix various Linkfunctions
LibraChris May 18, 2024
43cea23
Rework GLM QR Solver
LibraChris May 22, 2024
3e83833
Modify tests
LibraChris May 22, 2024
2816155
Add tests prototype for QR-Stepwise iteration
LibraChris May 22, 2024
c5ced84
Fix QR based solver for GLMs
LibraChris May 22, 2024
5029c3a
Modify Variance tests
LibraChris May 22, 2024
c3dddcb
Update statistics
LibraChris May 28, 2024
a7c5c1b
Update GeneralisedLinearModel.fs
LibraChris May 28, 2024
d8877b7
Update GeneralisedLinearModel.fs
LibraChris May 29, 2024
3cd68a8
Update GeneralisedLinearModel.fs
LibraChris May 30, 2024
253ac91
Rework GLMStatistics
LibraChris May 31, 2024
19cad0f
Remove deprecated GLM.Irls
LibraChris May 31, 2024
1b3336f
Fix minor testing issue
LibraChris May 31, 2024
a1d0ee4
add getFamilyReisualDeviance for more families
LibraChris Jun 2, 2024
37d03e0
Write code comments and documentation
LibraChris Jun 5, 2024
5e9a1b6
add Documentation for GLM Usage
LibraChris Jun 7, 2024
72bfb83
Update formating for documentation
LibraChris Jun 10, 2024
a8a0004
added data for Documentation
LibraChris Jun 10, 2024
f694340
remote tests for binominal family variance
LibraChris Jun 10, 2024
8dcd8ab
Adress changes requested in #344
LibraChris Jun 12, 2024
170519e
Adress changes requested in #334
LibraChris Jun 18, 2024
ba5ae9c
Update xml comments
LibraChris Jun 19, 2024
6c3a235
fix building error
LibraChris Jul 3, 2024
2e80081
Fix Typo
LibraChris Jul 3, 2024
13b3de9
Fix indentations
LibraChris Aug 26, 2024
df24c3f
Updated XML documentation
LibraChris Oct 22, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Adress changes requested in #344
  • Loading branch information
LibraChris committed Jul 3, 2024
commit 8dcd8ab565b2a4ba53f0c6c718f62d7e4f5e1669
6 changes: 3 additions & 3 deletions docs/GeneralisedLinearModels.fsx
Original file line number Diff line number Diff line change
Expand Up @@ -218,7 +218,7 @@ let mTol = 1e-6

// Fit the model
let glm =
FSharp.Stats.Fitting.GLM.QR.solveQrNewton A b maxIter distributionFamily mTol
FSharp.Stats.Fitting.GLM.SolveGLM.solveQR A b maxIter distributionFamily mTol

glm
(***include-value:glm***)
Expand All @@ -238,7 +238,7 @@ Using this map we can also access the zScore and Pearson scores of each of the p
*)

let glmPredictions =
FSharp.Stats.Fitting.GLM.QR.getGLMParameterStatistics A b glm ["Intercept"; "Acetic"; "H2S"; "Lactic"]
FSharp.Stats.Fitting.GLM.GLMStatistics.getGLMParameterStatistics A b glm ["Intercept"; "Acetic"; "H2S"; "Lactic"]
|> Map.ofSeq

(***include-value:glmPredictions***)
Expand Down Expand Up @@ -308,5 +308,5 @@ Pearson Chi-Square is another measure of goodness of fit. It assesses how well t
These statistics together give us a comprehensive view of the model's performance and its ability to explain the variability in the data.
*)

let glmStats = FSharp.Stats.Fitting.GLM.QR.getGLMModelStatistics b glm distributionFamily
let glmStats = FSharp.Stats.Fitting.GLM.GLMStatistics.getGLMStatisticsModel b glm distributionFamily
(***include-value:glmStats***)
90 changes: 90 additions & 0 deletions src/FSharp.Stats/Algebra/LinearAlgebra.fs
Original file line number Diff line number Diff line change
Expand Up @@ -219,6 +219,96 @@ module LinearAlgebra =
// else LinearAlgebraManaged.QR a
LinearAlgebraManaged.QR a

/// Performs QR decomposition using an alternative algorithm.
/// Returns the orthogonal matrix Q and the upper triangular matrix R.
let qrAlternative (A: Matrix<float>) =
let m: int = A.NumRows
let n: int = A.NumCols

let q: Matrix<float> = Matrix.zero m n
let r: Matrix<float> = Matrix.zero n n
let qLengths: Vector<float> = Vector.zeroCreate n

let getVectorLength (v: Vector<float>) = Vector.fold (fun folder i -> folder+(i*i)) 0. v

let setqOfA (n: int) =
let aN: Vector<float> = Matrix.getCol A n
let qN =
if n = 0 then
aN
else
Array.init (n) (fun i ->
let denominator = qLengths[i]
let forNominator: Vector<float> = Matrix.getCol q i
let nominator: float = Vector.dot aN forNominator
r.[i, n] <- nominator
(nominator/denominator) * forNominator
)
|> Array.fold (fun folder e -> folder-e ) aN
Matrix.setCol q n qN
qN

for i=0 to n-1 do
let qN = setqOfA i
let qLength = getVectorLength qN
let rValue = sqrt(qLength)
r[i,i] <- rValue
qLengths[i] <- qLength
Comment on lines +264 to +268
Copy link
Member

@bvenn bvenn Jun 23, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure if I get whats happening here:

  1. qLengths is a zero-vector that gets filled iteratively for each i in line 262
  2. in line 256 setqOfA is called with the current i. However in l 247 qLength is called at position i which always should be 0, because it was initialized with 0. Just after the determination of qLength the value at this specific index is replaced in line 262.

Obviously I miss a step, but is there a possibility to create the qLength by Vector.init (...) rather than keeping it mutable and accessing/mutating it at multiple positions? If not, I'm happy to merge it, but I got confused during my attempt to understand whats happening here..

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you additionally set into -brackets a reference to this implementation and a short description what the benefit, drawback is when using this implementation over others.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I do not have a direct reference since I had a look at the mathematical explanation for QR via Gram Schmidt and tried implementing it myself. The only real upside using this method instead of the in FSharp.Stats established one is the difference in the output Dimensions in R. An actual reference would be the numpy implantation of the reduced qr https://numpy.org/doc/stable/reference/generated/numpy.linalg.qr.html .

I also think that this implemtation needs to be optimised in the future to perfect the GLM.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure if I get whats happening here:

  1. qLengths is a zero-vector that gets filled iteratively for each i in line 262
  2. in line 256 setqOfA is called with the current i. However in l 247 qLength is called at position i which always should be 0, because it was initialized with 0. Just after the determination of qLength the value at this specific index is replaced in line 262.

Obviously I miss a step, but is there a possibility to create the qLength by Vector.init (...) rather than keeping it mutable and accessing/mutating it at multiple positions? If not, I'm happy to merge it, but I got confused during my attempt to understand whats happening here..

The mutation was done to simplify the function to match the mathematical explanation better. This is one of the things that could be updated when I get to it.


for i=0 to n-1 do
let qN: Vector<float> = Matrix.getCol q i
let updateQ = (1./sqrt( qLengths[i] )) * qN
Matrix.setCol q i updateQ
for j=i+1 to n-1 do
let denominator = r[i, i]
let nominator = r[i, j]
r[i, j] <- (nominator/denominator)

q, r

/// Solves a linear system of equations using QR decomposition.
///
/// Parameters:
/// - A: The coefficient matrix of the linear system.
/// - t: The target vector of the linear system.
///
/// Returns:
/// - mX: The solution vector of the linear system.
/// - r: The upper triangular matrix obtained from QR decomposition.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the updates!
I'm going to merge this PR if you just could add proper XML docs like seen in:

/// <summary>
/// Calculates the slope of a regression line through the origin
/// </summary>
/// <param name="xData">vector of x values</param>
/// <param name="yData">vector of y values</param>
/// <returns>Slope of the regression line through the origin</returns>
/// <example>
/// <code>
/// // e.g. days since a certain event
/// let xData = vector [|0.;1.;2.;3.;4.;5.;|]
/// // some measured feature, that theoretically is zero at day 0
/// let yData = vector [|1.;5.;9.;13.;17.;18.;|]
///
/// // Estimate the slope of the regression line.
/// let coefficients =
/// LinearRegression.OLS.Linear.RTO.fit xData yData
/// </code>
/// </example>

The same for line 222 f.

let solveLinearQR (A: Matrix<float>) (t: Vector<float>) =
let m = A.NumRows
let n = A.NumCols

System.Diagnostics.Debug.Assert(m >= n)

let q,r = qrAlternative A

let QT = q.Transpose

let mX = Vector.zeroCreate n

let c: Vector<float> = QT * t

let rec build_mX_inner cross_prod i j =
if j=n then
cross_prod
else
let newCrossprod = cross_prod + (r[i, j] * mX[j])
build_mX_inner newCrossprod i (j+1)

let rec build_mX_outer i =
if i<0 then
()
else
let crossProd = build_mX_inner 0. i (i+1)
mX[i] <- (c[i] - crossProd) / r[i, i]
build_mX_outer (i-1)

build_mX_outer (n-1)

mX,r


///Returns the full Singular Value Decomposition of the input MxN matrix
///
///A : A = U * SIGMA * V**T in the tuple (S, U, V**T),
Expand Down
112 changes: 11 additions & 101 deletions src/FSharp.Stats/Fitting/GeneralisedLinearModel.fs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ namespace FSharp.Stats.Fitting.GLM

open System
open FSharp.Stats
open Algebra.LinearAlgebra

///
/// Represents the distribution families for Generalized Linear Models (GLMs).
Expand Down Expand Up @@ -410,95 +411,10 @@ module GLMStatistics =
}
)

module QR =

let internal qrAlternative (A:Matrix<float>) =
let m: int = A.NumRows
let n: int = A.NumCols

let q: Matrix<float> = Matrix.zero m n
let r: Matrix<float> = Matrix.zero n n
let qLengths: Vector<float> = Vector.zeroCreate n

let getVectorLength (v: Vector<float>) = Vector.fold (fun folder i -> folder+(i*i)) 0. v

let setqOfA (n: int) =
let aN: Vector<float> = Matrix.getCol A n
let qN =
if n = 0 then
aN
else
Array.init (n) (fun i ->
let denominator = qLengths[i]
let forNominator: Vector<float> = Matrix.getCol q i
let nominator: float = Vector.dot aN forNominator
r.[i, n] <- nominator
(nominator/denominator) * forNominator
)
|> Array.fold (fun folder e -> folder-e ) aN
Matrix.setCol q n qN
qN

for i=0 to n-1 do
let qN = setqOfA i
let qLength = getVectorLength qN
let rValue = sqrt(qLength)
r[i,i] <- rValue
qLengths[i] <- qLength

for i=0 to n-1 do
let qN: Vector<float> = Matrix.getCol q i
let updateQ = (1./sqrt( qLengths[i] )) * qN
Matrix.setCol q i updateQ
for j=i+1 to n-1 do
let denominator = r[i, i]
let nominator = r[i, j]
r[i, j] <- (nominator/denominator)

q,r

/// Solves a linear system of equations using QR decomposition.
///
/// Parameters:
/// - A: The coefficient matrix of the linear system.
/// - t: The target vector of the linear system.
///
/// Returns:
/// - mX: The solution vector of the linear system.
/// - r: The upper triangular matrix obtained from QR decomposition.
let internal solveLinearQR (A: Matrix<float>) (t: Vector<float>) =
let m = A.NumRows
let n = A.NumCols
module QRSolver =

System.Diagnostics.Debug.Assert(m >= n)

let q,r = qrAlternative A

let QT = q.Transpose

let mX = Vector.zeroCreate n

let c: Vector<float> = QT * t

let rec build_mX_inner cross_prod i j =
if j=n then
cross_prod
else
let newCrossprod = cross_prod + (r[i, j] * mX[j])
build_mX_inner newCrossprod i (j+1)

let rec build_mX_outer i =
if i<0 then
()
else
let crossProd = build_mX_inner 0. i (i+1)
mX[i] <- (c[i] - crossProd) / r[i, i]
build_mX_outer (i-1)

build_mX_outer (n-1)

mX,r

/// Performs a stepwise gain QR calculation for a generalised linear model.
/// This function calculates the cost, updated mean values, updated linear predictions,
/// weighted least squares results, and weighted least squares endogenous values for a given
Expand Down Expand Up @@ -645,24 +561,18 @@ module QR =

{mX=mX;mu=mu}

/// Calculates the model statistics for a solved generalized linear model.
module SolveGLM =

/// Solves a generalized linear model using the QR decomposition and Newton's method.
bvenn marked this conversation as resolved.
Show resolved Hide resolved
///
/// Parameters:
/// - A: The design matrix.
/// - b: The response vector.
/// - solvedGLM: The solved generalized linear model.
/// - maxIter: The maximum number of iterations.
/// - mDistributionFamily: The distribution family of the model.
/// - mTol: The tolerance for convergence.
///
/// Returns: The model statistics.
let getGLMModelStatistics (b:Vector<float>) (solvedGLM:GLMReturn) (mDistributionFamily:GlmDistributionFamily) =
GLMStatistics.getGLMStatisticsModel b solvedGLM mDistributionFamily
/// Calculates the parameter statistics for a solved generalized linear model.
/// Returns: The solved generalized linear model.
let solveQR (A: Matrix<float>) (b: Vector<float>) (maxIter: int) (mDistributionFamily: GlmDistributionFamily) (mTol: float) =
QRSolver.solveQrNewton (A: Matrix<float>) (b: Vector<float>) (maxIter: int) (mDistributionFamily: GlmDistributionFamily) (mTol: float)

///
/// Parameters:
/// - A: The design matrix.
/// - b: The response vector.
/// - solved: The solved generalized linear model.
///
/// Returns: The parameter statistics.
let getGLMParameterStatistics (A:Matrix<float>) (b:Vector<float> ) (solved:GLMReturn) =
GLMStatistics.getGLMParameterStatistics A b solved
8 changes: 4 additions & 4 deletions tests/FSharp.Stats.Tests/GeneralisedLinearModels.fs
Original file line number Diff line number Diff line change
Expand Up @@ -911,7 +911,7 @@ let GLMStepwise =
let wlsendogNewExpected = Vector.zeroCreate 10

let costActual,mu_newActual,linPred_newActual,wlsResult_newActual,wlsendogNewActual =
QR.stepwiseGainQR A b mFam t mu linPred oldResults
FSharp.Stats.Fitting.GLM.QRSolver.stepwiseGainQR A b mFam t mu linPred oldResults

for i=0 to (A.NumRows-1) do
Expect.floatClose Accuracy.high mu.[i] muStartExpected.[i] "muStart differs great"
Expand Down Expand Up @@ -946,7 +946,7 @@ let GLMTestsQR =
let cheeseMatrix,cheeseVector = HelperFunctions.generateBaseMatrixAndVector "Taste" [] cheeseframe

let actualResultsRaw =
QR.solveQrNewton cheeseMatrix cheeseVector 200 GlmDistributionFamily.Poisson tolRef
SolveGLM.solveQR cheeseMatrix cheeseVector 200 GlmDistributionFamily.Poisson tolRef
let actualResults = actualResultsRaw.mX

Expect.floatClose Accuracy.medium actualResults.[0] expected.[0] "GLM Intecept wrong"
Expand All @@ -972,7 +972,7 @@ let GLMTestsQR =
let energyMatrix,energyVector = HelperFunctions.generateBaseMatrixAndVector "Energy" [] energyframe

let actualResultsRaw =
QR.solveQrNewton energyMatrix energyVector 200 GlmDistributionFamily.Poisson tolRef
SolveGLM.solveQR energyMatrix energyVector 200 GlmDistributionFamily.Poisson tolRef
let actualResults = actualResultsRaw.mX

Expect.floatClose Accuracy.medium actualResults.[0] expected.[0] "GLM Intecept wrong"
Expand All @@ -999,7 +999,7 @@ let GLMTestsQR =
let lungcapMatrix,lungcapVector = HelperFunctions.generateBaseMatrixAndVector "FEV" [] lungcapframe

let actualResultsRaw =
QR.solveQrNewton lungcapMatrix lungcapVector 200 GlmDistributionFamily.Gamma tolRef
SolveGLM.solveQR lungcapMatrix lungcapVector 200 GlmDistributionFamily.Gamma tolRef
let actualResults = actualResultsRaw.mX

let x = $"{actualResults.[0]} {actualResults.[1]} {actualResults.[2]} {actualResults.[3]} {actualResults.[4]}"
Expand Down