Skip to content

Commit

Permalink
add new version PCA
Browse files Browse the repository at this point in the history
  • Loading branch information
ZimmerD committed Jun 23, 2022
1 parent 262f1ac commit 8b300c3
Showing 1 changed file with 86 additions and 256 deletions.
342 changes: 86 additions & 256 deletions src/FSharp.Stats/ML/Unsupervised/PrincipalComponentAnalysis.fs
Original file line number Diff line number Diff line change
Expand Up @@ -3,263 +3,93 @@ namespace FSharp.Stats.ML.Unsupervised
open FSharp.Stats


(** Problems:
- How to calculate Loadings/Correlation of Components
- Matrix Adjustment: Does standardisation has to be performend with the whole matrix or original matrix if
supplemental variables are included at transformation.
*)

/// Principle component analysis
module PCA =

/// Represents a principle component
type Component = { // Eigenvectors are columns of matrix Q (loading matrix)
// The elements of Q (somethimes refered as loadings) differs from loadings only by the normalization. The elements of Q (Eigenvectors) are normalized such that
// the sum of the squared elements of a given component is equal to one.
EigenVector : float[];
// EigenValues represent either the variance of the original data contained in each principal component (in case they were computed from the covariance matrix),
// or the amount of correlation captured by the respective principle components (in case they were computed from the correlation matrix)
EigenValue : float;
// Coefficients of correlation
Loadings : float[];
Proportion : float;
CumulativeProportion : float;
Index : int;
}

type AdjustmentDirection =
| Obverse
| Reverse

/// AdjustmentFactorygiven,given a dataset,
// should generate a adjusted dataset
type AdjustmentFactory = AdjustmentDirection -> Matrix<float> -> Matrix<float>

/// Returns an AdjustmentFactory which centers the data
let toAdjustCenter (data:Matrix<float>) : AdjustmentFactory =
let colMeans =
data |> Matrix.meanColumnWise

let adjust (direction:AdjustmentDirection) (aData:Matrix<float>) =
match direction with
| Obverse -> aData |> Matrix.mapi (fun ri ci value -> value - colMeans.[ci] )
| Reverse -> aData |> Matrix.mapi (fun ri ci value -> value + colMeans.[ci] )
adjust


/// Returns an AdjustmentFactory as covariance matrix
let toAdjustCovariance (data:Matrix<float>) : AdjustmentFactory =
let colMeans =
data |> Matrix.meanColumnWise
let sqrtI = sqrt (float data.NumRows)
let adjust (direction:AdjustmentDirection) (aData:Matrix<float>) =
match direction with
| Obverse -> aData |> Matrix.mapi (fun ri ci value -> (value - colMeans.[ci]) / sqrtI )
| Reverse -> aData |> Matrix.mapi (fun ri ci value -> (value * sqrtI) + colMeans.[ci] )
adjust


/// Returns an AdjustmentFactory which centers and standardize the data
let toAdjustStandardize (data:Matrix<float>) : AdjustmentFactory =
let colMeans =
data |> Matrix.meanColumnWise
// Atttention: not entierly shure if space of data before
let colStDev =
data |> Matrix.Generic.mapCols Seq.stDevPopulation |> Seq.toArray

let adjust (direction:AdjustmentDirection) (aData:Matrix<float>) =
match direction with
| Obverse ->
aData
|> Matrix.mapi (fun ri ci value ->
if colStDev.[ci] = 0. then
raise (System.ArgumentException(sprintf "Standard deviation cannot be zero (cannot standardize the constant variable at column index %i" ci))
(value - colMeans.[ci]) / colStDev.[ci] )
| Reverse ->
aData
|> Matrix.mapi (fun ri ci value ->
if colStDev.[ci] = 0. then
raise (System.ArgumentException(sprintf "Standard deviation cannot be zero (cannot standardize the constant variable at column index %i" ci))
(value * colStDev.[ci]) + colMeans.[ci] )
adjust


/// Returns an AdjustmentFactory which centers and standardize the data
let toAdjustCorrelation (data:Matrix<float>) : AdjustmentFactory =
let colMeans =
data |> Matrix.meanColumnWise
let sqrtI = sqrt (float data.NumRows)
// Attention: not entirely sure if space of data before
let colStDev =
data |> Matrix.Generic.mapCols Seq.stDevPopulation |> Seq.toArray
let adjust (direction:AdjustmentDirection) (aData:Matrix<float>) =
match direction with
| Obverse ->
aData
|> Matrix.mapi (fun ri ci value ->
if colStDev.[ci] = 0. then
raise (System.ArgumentException(sprintf "Standard deviation cannot be zero (cannot standardize the constant variable at column index %i" ci))
(value - colMeans.[ci]) / colStDev.[ci] * sqrtI)
| Reverse ->
aData
|> Matrix.mapi (fun ri ci value ->
if colStDev.[ci] = 0. then
raise (System.ArgumentException(sprintf "Standard deviation cannot be zero (cannot standardize the constant variable at column index %i" ci))
(value * colStDev.[ci] / sqrtI) + colMeans.[ci] )
adjust



/// Creates a principle component type
let createComponent eigenVector eigenValue loadings proportion cumulativeProportion index =
{ EigenVector = eigenVector; EigenValue = eigenValue; Loadings = loadings; Proportion = proportion; CumulativeProportion = cumulativeProportion; Index = index; }


/// Creates the principle components of eigenVectors and eigenValues
let private createComponentsOf (eigenVectors: Matrix<float>) (singularValues:seq<float>) =
// Eigenvalues are the square of the singular values
let eigenValues = singularValues |> Seq.map (fun x -> x * x )
// Sort eigenvalues according abs
let sortedEigenValues =
eigenValues
|> Seq.mapi (fun i x -> (i,x))
|> Seq.sortBy (fun (i,ev) -> - (abs ev))
// Calculate proportions of variance
let sumOfEigenValues =
let sum = eigenValues |> Seq.sumBy (fun x -> abs x)
if sum = 0.0 then 0.0 else 1. / sum
// Calculate cumulative proportions of variance
let componentCumulative =
sortedEigenValues
|> Seq.scan (fun (index,cpv) (colI,ev) ->
let componentProportion = (abs ev) * sumOfEigenValues
(index + 1,cpv + componentProportion)) (0,0.0) |> Seq.skip 1

// // Calculate factor structure (not needed because Equals Q)
// let singularValues = eigenValues |> Seq.map (fun ev -> sqrt ev) |> Seq.toArray
// let lM = DiagonalMatrix(singularValues.Length,singularValues.Length,singularValues)
// let fsMatrix = lM * eigenVectors

// Create component type
Seq.map2 (fun (index,cpv) (colI,ev) ->
let componentProportion = (abs ev) * sumOfEigenValues
//createComponent (eigenVectors.Column(colI) |> Seq.toArray) ev (fsMatrix.Column(colI) |> Seq.toArray) componentProportion cpv index) componentCumulative sortedEigenValues
createComponent (eigenVectors.Column(colI) |> Seq.toArray) ev (singularValues |> Seq.toArray) componentProportion cpv index) componentCumulative sortedEigenValues
|> Seq.toArray



///// Computes a principal componant analysis of a given covariance matrix
//let computeOfCovarianceMatrix (covMatrix: #Matrix<float>) =
// // Calculate the eigenvectors and eigenvalues
// let evd = covMatrix.Evd()

// createComponentsOf (evd.EigenVectors) (evd.EigenValues |> Seq.map (fun x -> sqrt x.r))



/// Computes a principal componant analysis of a given covariance matrix
/// !Attention: Matrix needs to be centered before
// The SVD method is used for numerical accuracy
let computeOfMatrix (dataMatrix: Matrix<float>) =
// Perform the Singular Value Decomposition (SVD) of the matrix
let transpose = if dataMatrix.NumRows < dataMatrix.NumCols then true else false

// (umatrix,s,vmatrix) let s,u,v =
let s,u,v =
if transpose then
FSharp.Stats.Algebra.LinearAlgebra.SVD dataMatrix.Transpose
else
FSharp.Stats.Algebra.LinearAlgebra.SVD dataMatrix
let singularValues = s
//printfn "SVD done"
// EigenVectors are the right sigular vectors
let eigenVectors = if transpose then u else FSharp.Stats.Algebra.LinearAlgebra.Inverse v
// // Eigenvalues are the square of the singular values
// let eigenValues = singularValues |> Vector.map (fun x -> x * x )

createComponentsOf eigenVectors singularValues
//The Implementation was compared to the R function prcomp(). The implementation is based on remarks found in https://stats.stackexchange.com/a/134283
//Signs of loadings and principal components (scores) can differ from the R implementation due to different svd implementations being used internally.
//Colab workbook for direct comparison to prcomps output is accessible at: https://colab.research.google.com/drive/1DJ4ky5F5kBM87JprmAbx_gTHqSdz3vqU?usp=sharing

type PCA = {
VarianceOfComponent: vector
///Variance explained by principal components
VarExplainedByComponentIndividual : vector
///Cumulative Variance explained by principal components
VarExplainedByComponentCumulative : vector
///Matrix with columns representing individual principal components ("raw" principal components, projections on principal directions) and rows representing samples.
///Also reffered to as "scores". Corresponds to the attribute "x" of the result object of Rs prcomp() function.
PrincipalComponents : Matrix<float>
///Matrix with columns representing component loadings and rows individual features.
///Corresponds to the attribute "rotation" of the result object of Rs prcomp() function.
Loadings : Matrix<float>
}



/// Computes a principal componant analysis
let compute (adj:AdjustmentFactory) (dataMatrix: Matrix<float>) =
computeOfMatrix (adj AdjustmentDirection.Obverse dataMatrix)


// /// Filter components according to min variance
// let t =
// 1.0

/// Returns feature matrix (eigenvector matrix) from components
let getFeatureMatrixOfComponents (components:Component[]) =
components
|> Seq.map (fun c -> c.EigenVector |> Array.toList)
|> Seq.toList
|> Matrix.ofJaggedColList


/// Returns communality
let getCommunality (components:Component[]) =
let fsMatrix =
components
|> Seq.map (fun c -> c.Loadings |> Array.toList)
|> Seq.toList
|> Matrix.ofJaggedColList
fsMatrix.Transpose * fsMatrix.Diagonal
//fsMatrix.TransposeAndMultiply(fsMatrix).Diagonal().ToArray()


/// Projects a given matrix into principal component space (projections or factor scores)
let transform (adj:AdjustmentFactory) (components:Component[]) (dataMatrix: Matrix<float>) =
let dataM = adj AdjustmentDirection.Obverse dataMatrix
let featureM = getFeatureMatrixOfComponents components

dataM * featureM


/// Reverts a set of projected data into it's original form. Complete reverse
/// transformation is only possible if all components are present, and, if the
/// data has been standardized, the original standard deviation and means of
/// the original matrix are known.
let revert (adj:AdjustmentFactory) (components:Component[]) (dataMatrix: Matrix<float>) =
let featureM = getFeatureMatrixOfComponents components

let revMatrix = dataMatrix * featureM.Transpose
adj AdjustmentDirection.Reverse revMatrix


///// Contribution of an observation to a component
//// High contribution value means contribution of this observation is larger then the average contribution
//let contributionOfTransformed (transformedDataMatrix: #Matrix<float>) =
// // square future matrix
// let sfm = transformedDataMatrix |> Matrix.map (fun x -> x * x)
// // sum over columns
// let colSums = sfm |> Matrix.sumRowsBy (fun coli col -> col)

// sfm
// |> Matrix.mapCols (fun coli col -> col.Divide(colSums.[coli]))


///// Calculates the squared cosine of a component with an observation
//// The squared cosine shows the importance of a component for a given observation
//let importanceOfTransformed (transformedDataMatrix: #Matrix<float>) =
// // square future matrix
// let sfm = transformedDataMatrix |> Matrix.map (fun x -> x * x)
// let d2 = sfm |> Matrix.sumColsBy (fun coli col -> col)

// sfm
// |> Matrix.mapCols (fun coli col -> col.PointwiseDivide(d2))


/// Returns xy-coordinates for scree plot in a tuple (component number vs. EigenValue)
/// Scree plot: represents the ability of PCs to explain de variation in data
let zipScree (components:Component[]) =
components |> Seq.map ( fun c -> (float c.Index,c.EigenValue) )

let zipScores componentIndex1 componentIndex2 (transformedDataMatrix: Matrix<float>) =
Seq.zip (transformedDataMatrix.Column(componentIndex1)) (transformedDataMatrix.Column(componentIndex1))



/// Normalizes each feature by substracting the corresponing mean followed by a division by its standard deviation.
/// The centered features of the matrix are centered around 0 and possess a standard deviation of 1.
/// Expects a data matrix with rows representing observations and columns representing features.
let center m =
let columnMeans =
m
|> Matrix.mapiCols (fun i x -> Seq.mean x)
|> vector

let columnStabw =
m
|> Matrix.mapiCols (fun i x -> Seq.stDev x)
|> vector

let substractionMatrix =
let colV = Vector.init m.NumRows (fun i -> 1.)
colV*columnMeans.Transpose

let stabwMatrix =
let colV = Vector.init m.NumRows (fun i -> 1.)
colV*columnStabw.Transpose

let centeredM =
m - substractionMatrix
|> Matrix.mapi (fun i j x -> x / stabwMatrix.[i,j] )
centeredM

/// Computes the PCA of a column centered data matrix m.
/// Expects a column centered data matrix m, with rows representing observations (a.k.a. samples) and columns representing features.
let compute m =
if m |> Matrix.exists (fun x -> nan.Equals(x) || infinity.Equals(x) || infNeg.Equals(x)) then
failwith "Computation not possible. Matrix contains invalid entries. Check for the existence of values equal to nan, infinity or -infinity."
else
let s,u,v = FSharp.Stats.Algebra.LinearAlgebra.SVD (m)
let n = m.NumRows |> float
let varOfComp =
let e = Vector.map (fun x -> x**2.) s
e
|> Vector.map (fun x -> x / (n-1.))
|> Vector.map sqrt
let varExplained =
let e = Vector.map (fun x -> x**2.) s
let sum = Seq.sum e
Vector.map (fun l -> abs l / sum) e
let varExplainedCum =
varExplained
|> Vector.foldi (fun i (sum,(v:vector)) x ->
let c = x+sum
v.[i] <- c
c,v
) (0.,Vector.zeroCreate varExplained.Length)
|> snd
let pc = u * (Matrix.diag (Vector.init u.NumCols (fun i -> if i <= s.Length-1 then s.[i] else 0.)));
let loadings = Matrix.getCols v.Transpose 0 varExplained.Length
// proper loadings when https://stats.stackexchange.com/a/141531 is right, however the described scaling does
// not result in vectors of norm 1 (the columns of principal axes do) and differ from rs prcomp() "rotation" propertie which is
// described here: https://stats.stackexchange.com/a/510465 and here https://www.rdocumentation.org/packages/stats/versions/3.6.2/topics/prcomp as loadings
//let principalAxes = Matrix.getCols v.Transpose 0 varExplained.Length
//let loadings =
// principalAxes*(Matrix.diag (Vector.init principalAxes.NumCols (fun i -> if i <= s.Length-1 then s.[i] else 0.)))
// |> Matrix.map (fun x -> x / sqrt(n-1.))
// Note: Rs biplot function with param scale=0 applies this scaling to loadings: //principalAxes*(Matrix.diag (Vector.init principalAxes.NumCols (fun i -> if i <= s.Length-1 then s.[i] else 0.)))
{
VarianceOfComponent=varOfComp
VarExplainedByComponentIndividual = varExplained
VarExplainedByComponentCumulative = varExplainedCum
PrincipalComponents = pc
Loadings = loadings
}

0 comments on commit 8b300c3

Please sign in to comment.