Skip to content

Commit

Permalink
rename Impute module to Imputation
Browse files Browse the repository at this point in the history
closes #341
  • Loading branch information
bvenn committed Jan 7, 2025
1 parent 57f8939 commit 529c2c0
Show file tree
Hide file tree
Showing 4 changed files with 196 additions and 100 deletions.
14 changes: 7 additions & 7 deletions docs/Imputation.fsx
Original file line number Diff line number Diff line change
Expand Up @@ -62,8 +62,8 @@ Missing data imputation based on the k-nearest neighbour algorithm:
*)

// init kNearest MatrixBaseImpute
let kn : Impute.MatrixBaseImputation<float[],float> = Impute.kNearestImpute 2
let imputedData = Impute.imputeBy kn Ops.isNan data
let kn : Imputation.MatrixBaseImputation<float[],float> = Imputation.kNearestImpute 2
let imputedData = Imputation.imputeBy kn Ops.isNan data

(*** hide ***)
let imputedDataMatrix = "k nearest neighbours imputed data\r\n" + FSharp.Stats.FSIPrinters.matrix (matrix imputedData)
Expand All @@ -79,10 +79,10 @@ let imputedDataMatrix = "k nearest neighbours imputed data\r\n" + FSharp.Stats.F
*)

// init random VectorBaseImpute
let rnd = Impute.rnd (System.Random())
let rnd = Imputation.rnd (System.Random())

let rndRowWise = Impute.imputeRowWiseBy rnd Ops.isNan data
let rndColWise = Impute.imputeColWiseBy rnd Ops.isNan data
let rndRowWise = Imputation.imputeRowWiseBy rnd Ops.isNan data
let rndColWise = Imputation.imputeColWiseBy rnd Ops.isNan data

(*** hide ***)
let rndRowDataMatrix = "rndRowDataMatrix imputed data\r\n" + FSharp.Stats.FSIPrinters.matrix (matrix rndRowWise)
Expand All @@ -100,8 +100,8 @@ let rndColDataMatrix = "rndColDataMatrix imputed data\r\n" + FSharp.Stats.FSIPri
*)

let normalRowWise = Impute.imputeRowWiseBy Impute.normal Ops.isNan data
let normalColWise = Impute.imputeColWiseBy Impute.normal Ops.isNan data
let normalRowWise = Imputation.imputeRowWiseBy Imputation.normal Ops.isNan data
let normalColWise = Imputation.imputeColWiseBy Imputation.normal Ops.isNan data


(*** hide ***)
Expand Down
1 change: 1 addition & 0 deletions src/FSharp.Stats/FSharp.Stats.fsproj
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,7 @@
<!-- ML -->
<Compile Include="ML\SurprisalAnalysis.fs" />
<Compile Include="ML\SimilarityMetrics.fs" />
<Compile Include="ML\Imputation.fs" />
<Compile Include="ML\Impute.fs" />
<Compile Include="ML\DistanceMetrics.fs" />
<!-- ML / Unsupervised -->
Expand Down
170 changes: 170 additions & 0 deletions src/FSharp.Stats/ML/Imputation.fs
Original file line number Diff line number Diff line change
@@ -0,0 +1,170 @@
namespace FSharp.Stats.ML

open FSharp.Stats

/// Module for data imputation and missing value filtering
module Imputation =

module Cleaning =

/// Fraction of the entries of `dataRow` for which `isMissing` returns true.
/// An empty row evaluates 0.0 / 0.0 and therefore yields nan.
let calcFractionBy (isMissing) (dataRow:seq<'a>) =
    let mutable missing = 0
    let mutable total = 0
    for value in dataRow do
        total <- total + 1
        if isMissing value then missing <- missing + 1
    float missing / float total


/// Lazily keeps only the rows whose score `f row` is less than or equal to `threshold`.
let removeAllBy f threshold (data:seq<#seq<'a>>) =
    let keep row = f row <= threshold
    Seq.where keep data


/// Type definition for a vector-based imputation.
/// The imputed values are based only on the given array.
/// The function receives the values of a single vector and an index and returns the value to impute;
/// imputeRowWiseBy and imputeColWiseBy pass the non-missing values of the row/column and the position of the missing entry.
type VectorBaseImputation<'a> = seq<'a> -> int -> 'a

/// Type definition for a matrix-based imputation.
/// The imputed values are based on the given whole dataset.
/// The function receives the dataset, the row that is being imputed and an index and returns the value to impute;
/// imputeBy passes the rows without missing values as dataset and the position of the missing entry as index.
type MatrixBaseImputation<'a,'b> = seq<'a> -> 'a -> int -> 'b


/// <summary>Imputation by random sampling from the input vector</summary>
/// <remarks>Every value of the input vector can be drawn. Fails if the input vector is empty.</remarks>
/// <param name="rnd">Random number generator used to pick the position that is sampled</param>
/// <returns>Vector-based imputation returning a randomly picked value of the vector it is given; the index argument is not used</returns>
/// <example>
/// <code>
/// let rnd = Imputation.rnd (System.Random())
/// let rndRowWise = Imputation.imputeRowWiseBy rnd Ops.isNan data
/// </code>
/// </example>
let rnd (rnd:System.Random) : VectorBaseImputation<'a> =
    fun fdata index ->
        let farr = Array.ofSeq fdata
        if farr.Length < 1 then failwithf "Vector needs at least one non-missing value"
        // Random.Next(min, max) excludes max, so the upper bound has to be the length itself;
        // with Length - 1 the last value of the vector could never be sampled.
        farr.[rnd.Next(0,farr.Length)]


/// Imputation by sampling from a Gaussian normal distribution based on the input vector.
/// Mean and standard deviation are estimated from the given values; the index argument is not used.
/// Fails if one of the two estimates is NaN.
let normal : VectorBaseImputation<float> =
    fun fdata index ->
        let mu = Seq.mean fdata
        let sigma = Seq.stDev fdata
        match System.Double.IsNaN(mu), System.Double.IsNaN(sigma) with
        | false, false -> Distributions.Continuous.Normal.Sample mu sigma
        | _ -> failwithf "Vector needs at least two non-missing value"


///// Imputation by sampling from a Gaussian normal distribution based on the input vector
//let normalTruncated : VectorBaseImputation<float> =
// fun fdata index ->
// let mean = Seq.mean fdata
// let std = Seq.stDev fdata
// if not(System.Double.IsNaN(mean) || System.Double.IsNaN(std)) then
// Distributions.Continuous.Normal.Sample mean std
// else
// failwithf "Vector needs at least two non-missing value"


/// <summary>Imputation by k-nearest neighbour</summary>
/// <remarks>
/// The rows of the dataset are ranked by DistanceMetrics.euclideanNaNSquared to the row that is imputed.
/// The imputed value is the mean of the requested column over the k closest rows.
/// Array.take is used to select the neighbours, so the dataset has to contain at least k rows.
/// </remarks>
/// <param name="k">Number of nearest rows that are averaged</param>
/// <returns>Matrix-based imputation that takes the dataset, the row to impute and the column index of the missing value</returns>
/// <example>
/// <code>
/// let kn : Imputation.MatrixBaseImputation&lt;float[],float&gt; = Imputation.kNearestImpute 2
/// let imputedData = Imputation.imputeBy kn Ops.isNan data
/// </code>
/// </example>
let kNearestImpute k : MatrixBaseImputation<float[],float> =
    fun data arr ->
        // The column means of the k nearest rows do not depend on the queried index.
        // They are computed lazily once per (data, arr) pair and shared by all index lookups,
        // instead of repeating the search, transpose and mean for every missing value of the row.
        let neighbourMeans =
            lazy (
                data
                |> Array.ofSeq
                |> Array.map (fun candidate -> (DistanceMetrics.euclideanNaNSquared candidate arr, candidate))
                |> Array.sortBy fst
                |> Array.take k
                |> Array.map snd
                |> JaggedArray.transpose
                |> Array.map Seq.mean
            )
        fun index -> neighbourMeans.Value.[index]


/// <summary>Imputes column-wise by vector-based imputation</summary>
/// <remarks>The data is transposed, every column is imputed on its own and the result is transposed back.</remarks>
/// <param name="impute">Vector-based imputation; it receives the non-missing values of a column and the position of the missing value within that column</param>
/// <param name="isMissing">Predicate that marks a value as missing</param>
/// <param name="data">Row-wise dataset</param>
/// <returns>The dataset with every missing value replaced by the value the imputation returned for it</returns>
/// <example>
/// <code>
/// let rndColWise = Imputation.imputeColWiseBy (Imputation.rnd (System.Random())) Ops.isNan data
/// </code>
/// </example>
let imputeColWiseBy (impute: VectorBaseImputation<'a>) isMissing (data : seq<#seq<'a>>) =
    let fillColumn column =
        let observed = Array.filter (fun v -> not (isMissing v)) column
        let imputeAt = impute observed
        Array.mapi (fun i v -> if isMissing v then imputeAt i else v) column

    let columns = JaggedArray.transpose (JaggedArray.ofJaggedSeq data)
    JaggedArray.transpose (Array.map fillColumn columns)


/// <summary>Imputes row-wise by vector-based imputation</summary>
/// <remarks>Every row is imputed on its own, based on the non-missing values of that row only.</remarks>
/// <param name="impute">Vector-based imputation; it receives the non-missing values of a row and the position of the missing value within that row</param>
/// <param name="isMissing">Predicate that marks a value as missing</param>
/// <param name="data">Row-wise dataset</param>
/// <returns>The dataset with every missing value replaced by the value the imputation returned for it</returns>
/// <example>
/// <code>
/// let rndRowWise = Imputation.imputeRowWiseBy (Imputation.rnd (System.Random())) Ops.isNan data
/// </code>
/// </example>
let imputeRowWiseBy (impute: VectorBaseImputation<'a>) isMissing (data : seq<#seq<'a>>) =
    [|
        for row in JaggedArray.ofJaggedSeq data do
            let observed = Array.filter (fun v -> not (isMissing v)) row
            let imputeAt = impute observed
            yield Array.mapi (fun i v -> if isMissing v then imputeAt i else v) row
    |]


/// <summary>Imputes rows by matrix-based imputation</summary>
/// <remarks>Only rows without any missing value are handed to the imputation as reference dataset.</remarks>
/// <param name="impute">Matrix-based imputation; it receives the complete rows, the row that is imputed and the position of the missing value within that row</param>
/// <param name="isMissing">Predicate that marks a value as missing</param>
/// <param name="data">Row-wise dataset</param>
/// <returns>The dataset with every missing value replaced by the value the imputation returned for it</returns>
/// <example>
/// <code>
/// let kn : Imputation.MatrixBaseImputation&lt;float[],float&gt; = Imputation.kNearestImpute 2
/// let imputedData = Imputation.imputeBy kn Ops.isNan data
/// </code>
/// </example>
let imputeBy (impute: MatrixBaseImputation<'a[],'a>) isMissing data =
    // Reference dataset: all rows that contain no missing value, materialised as arrays
    let isComplete row = not (Seq.exists isMissing row)
    let completeRows =
        data
        |> Seq.filter isComplete
        |> Seq.map Seq.toArray
        |> Seq.toArray

    data
    |> JaggedArray.ofJaggedSeq
    |> Array.map (fun row ->
        let values = Array.ofSeq row
        let imputeAt = impute completeRows values
        Array.mapi (fun i v -> if isMissing v then imputeAt i else v) values
        )





111 changes: 18 additions & 93 deletions src/FSharp.Stats/ML/Impute.fs
Original file line number Diff line number Diff line change
@@ -1,37 +1,20 @@
namespace FSharp.Stats.ML

open FSharp.Stats
//open FSharp.Care
//open FSharp.Care.Collections
open System

/// Module for data imputation and missing value filtering.
/// Deprecated: its functions only forward to the Imputation module of the same namespace.
[<Obsolete("This module is deprecated. Use FSharp.Stats.ML.Imputation instead")>]
module Impute =

module Cleaning =

let calcFractionBy (isMissing) (dataRow:seq<'a>) =
dataRow
|> Seq.fold (fun (mc,nmc) state ->
match isMissing state with
| true -> (mc+1,nmc)
| false -> (mc,nmc+1) )
(0,0)
|> fun (mc,nmc) -> float mc / float (nmc + mc)
Imputation.Cleaning.calcFractionBy isMissing dataRow


let removeAllBy f threshold (data:seq<#seq<'a>>) =
data
|> Seq.filter (fun row -> f row <= threshold )


/// Type definintion for a vector based imputation.
/// The imputed values are based only on the given array
type VectorBaseImputation<'a> = seq<'a> -> int -> 'a

/// Type definintion for a vector based imputation
/// The imputed values are based on the given whole dataset
type MatrixBaseImputation<'a,'b> = seq<'a> -> 'a -> int -> 'b

Imputation.Cleaning.removeAllBy f threshold data

/// <summary>Imputation by random sampling from the input vector</summary>
/// <remarks></remarks>
Expand All @@ -41,22 +24,13 @@ module Impute =
/// <code>
/// </code>
/// </example>
let rnd (rnd:System.Random) : VectorBaseImputation<'a> =
fun fdata index ->
let farr = Array.ofSeq fdata
if farr.Length < 1 then failwithf "Vector needs at least one non-missing value"
farr.[rnd.Next(0,farr.Length - 1)]
let rnd (rnd:System.Random) : Imputation.VectorBaseImputation<'a> =
Imputation.rnd rnd


/// Imputation by sampling from a Gaussian normal distribution based on the input vector
let normal : VectorBaseImputation<float> =
fun fdata index ->
let mean = Seq.mean fdata
let std = Seq.stDev fdata
if not(System.Double.IsNaN(mean) || System.Double.IsNaN(std)) then
Distributions.Continuous.Normal.Sample mean std
else
failwithf "Vector needs at least two non-missing value"
let normal: Imputation.VectorBaseImputation<float> =
Imputation.normal


///// Imputation by sampling from a Gaussian normal distribution based on the input vector
Expand All @@ -78,23 +52,8 @@ module Impute =
/// <code>
/// </code>
/// </example>
let kNearestImpute k : MatrixBaseImputation<float[],float> =
fun data arr index ->

let kNearestFrom (distance:DistanceMetrics.Distance<'a>) k (arr: 'a array) (queryCoordinates:'a) =
arr
|> Array.map (fun t -> (distance t queryCoordinates,t))
|> Array.sortBy fst
|> Array.take k

let euclNanSq = DistanceMetrics.euclideanNaNSquared
let tmpArr =
kNearestFrom euclNanSq k (data |> Array.ofSeq) arr
|> Array.map snd
|> JaggedArray.transpose
|> Array.map Seq.mean
tmpArr.[index]

let kNearestImpute k : Imputation.MatrixBaseImputation<float[],float> =
Imputation.kNearestImpute k

/// <summary>Imputes column-wise by vector-based imputation</summary>
/// <remarks></remarks>
Expand All @@ -106,18 +65,9 @@ module Impute =
/// <code>
/// </code>
/// </example>
let imputeColWiseBy (impute: VectorBaseImputation<'a>) isMissing (data : seq<#seq<'a>>) =
data
|> JaggedArray.ofJaggedSeq
|> JaggedArray.transpose
|> Array.map (fun col ->
let fCol = col |> Array.filter (isMissing >> not)
let impute' = impute fCol
col
|> Array.mapi (fun i v -> if isMissing v then (impute' i) else v)
)
|> JaggedArray.transpose

let imputeColWiseBy (impute: Imputation.VectorBaseImputation<'a>) isMissing (data : seq<#seq<'a>>) =
Imputation.imputeColWiseBy impute isMissing data


/// <summary>Imputes row-wise by vector-based imputation</summary>
/// <remarks></remarks>
Expand All @@ -129,15 +79,8 @@ module Impute =
/// <code>
/// </code>
/// </example>
let imputeRowWiseBy (impute: VectorBaseImputation<'a>) isMissing (data : seq<#seq<'a>>) =
data
|> JaggedArray.ofJaggedSeq
|> Array.map (fun row ->
let fRow = row |> Array.filter (isMissing >> not)
let impute' = impute fRow
row
|> Array.mapi (fun i v -> if isMissing v then (impute' i) else v)
)
let imputeRowWiseBy (impute: Imputation.VectorBaseImputation<'a>) isMissing (data : seq<#seq<'a>>) =
Imputation.imputeRowWiseBy impute isMissing data


/// <summary>Imputes rows by matrix-based imputation</summary>
Expand All @@ -150,23 +93,5 @@ module Impute =
/// <code>
/// </code>
/// </example>
let imputeBy (impute: MatrixBaseImputation<'a[],'a>) isMissing data =
let fData =
data
|> Seq.filter (fun row -> row |> Seq.exists isMissing |> not)
|> Seq.map (fun row -> row |> Seq.toArray)
|> Seq.toArray

data
|> JaggedArray.ofJaggedSeq
|> Array.map (fun row ->
let row' = row |> Array.ofSeq
let impute' = impute fData row'
row'
|> Array.mapi (fun i v -> if isMissing v then (impute' i) else v)
)





let imputeBy (impute: Imputation.MatrixBaseImputation<'a[],'a>) isMissing data =
Imputation.imputeBy impute isMissing data

0 comments on commit 529c2c0

Please sign in to comment.