Skip to content

Commit 529c2c0

Browse files
committed
rename Impute module to Imputation
closes #341
1 parent 57f8939 commit 529c2c0

File tree

4 files changed

+196
-100
lines changed

4 files changed

+196
-100
lines changed

docs/Imputation.fsx

+7-7
Original file line numberDiff line numberDiff line change
@@ -62,8 +62,8 @@ Missing data imputation based on the k-nearest neighbour algorithm:
6262
*)
6363

6464
// init kNearest MatrixBaseImpute
65-
let kn : Impute.MatrixBaseImputation<float[],float> = Impute.kNearestImpute 2
66-
let imputedData = Impute.imputeBy kn Ops.isNan data
65+
let kn : Imputation.MatrixBaseImputation<float[],float> = Imputation.kNearestImpute 2
66+
let imputedData = Imputation.imputeBy kn Ops.isNan data
6767

6868
(*** hide ***)
6969
let imputedDataMatrix = "k nearest neighbours imputed data\r\n" + FSharp.Stats.FSIPrinters.matrix (matrix imputedData)
@@ -79,10 +79,10 @@ let imputedDataMatrix = "k nearest neighbours imputed data\r\n" + FSharp.Stats.F
7979
*)
8080

8181
// init random VectorBaseImpute
82-
let rnd = Impute.rnd (System.Random())
82+
let rnd = Imputation.rnd (System.Random())
8383

84-
let rndRowWise = Impute.imputeRowWiseBy rnd Ops.isNan data
85-
let rndColWise = Impute.imputeColWiseBy rnd Ops.isNan data
84+
let rndRowWise = Imputation.imputeRowWiseBy rnd Ops.isNan data
85+
let rndColWise = Imputation.imputeColWiseBy rnd Ops.isNan data
8686

8787
(*** hide ***)
8888
let rndRowDataMatrix = "rndRowDataMatrix imputed data\r\n" + FSharp.Stats.FSIPrinters.matrix (matrix rndRowWise)
@@ -100,8 +100,8 @@ let rndColDataMatrix = "rndColDataMatrix imputed data\r\n" + FSharp.Stats.FSIPri
100100
101101
*)
102102

103-
let normalRowWise = Impute.imputeRowWiseBy Impute.normal Ops.isNan data
104-
let normalColWise = Impute.imputeColWiseBy Impute.normal Ops.isNan data
103+
let normalRowWise = Imputation.imputeRowWiseBy Imputation.normal Ops.isNan data
104+
let normalColWise = Imputation.imputeColWiseBy Imputation.normal Ops.isNan data
105105

106106

107107
(*** hide ***)

src/FSharp.Stats/FSharp.Stats.fsproj

+1
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,7 @@
156156
<!-- ML -->
157157
<Compile Include="ML\SurprisalAnalysis.fs" />
158158
<Compile Include="ML\SimilarityMetrics.fs" />
159+
<Compile Include="ML\Imputation.fs" />
159160
<Compile Include="ML\Impute.fs" />
160161
<Compile Include="ML\DistanceMetrics.fs" />
161162
<!-- ML / Unsupervised -->

src/FSharp.Stats/ML/Imputation.fs

+170
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,170 @@
1+
namespace FSharp.Stats.ML
2+
3+
open FSharp.Stats
4+
5+
/// Module for data imputation and missing value filtering
6+
module Imputation =
7+
8+
module Cleaning =

    /// Fraction of the elements of dataRow that isMissing flags as missing
    /// (missing count divided by total count). An empty row yields nan,
    /// because the division is carried out on floats (0.0 / 0.0).
    let calcFractionBy (isMissing) (dataRow:seq<'a>) =
        let missingCount, presentCount =
            ((0, 0), dataRow)
            ||> Seq.fold (fun (missing, present) value ->
                if isMissing value then (missing + 1, present)
                else (missing, present + 1))
        float missingCount / float (presentCount + missingCount)


    /// Keeps only the rows whose score f row is less than or equal to threshold.
    let removeAllBy f threshold (data:seq<#seq<'a>>) =
        Seq.filter (fun row -> f row <= threshold) data
23+
24+
25+
/// Type definition for a vector based imputation.
26+
/// The imputed values are based only on the given array
27+
type VectorBaseImputation<'a> = seq<'a> -> int -> 'a
28+
29+
/// Type definition for a matrix based imputation
30+
/// The imputed values are based on the given whole dataset
31+
type MatrixBaseImputation<'a,'b> = seq<'a> -> 'a -> int -> 'b
32+
33+
34+
/// <summary>Imputation by random sampling from the input vector</summary>
/// <remarks>Every element of the input vector can be drawn. Fails if the vector is empty.</remarks>
/// <param name="rnd">Random number generator used to choose the sampled position</param>
/// <returns>Vector-based imputation that ignores the requested index and returns a randomly chosen element of the given vector</returns>
let rnd (rnd:System.Random) : VectorBaseImputation<'a> =
    fun fdata index ->
        let farr = Array.ofSeq fdata
        if farr.Length < 1 then failwith "Vector needs at least one non-missing value"
        // Random.Next(min, max) excludes max, so the upper bound has to be the
        // length itself; with Length - 1 the last element could never be drawn.
        farr.[rnd.Next(0, farr.Length)]
47+
48+
49+
/// Imputation by sampling from a Gaussian normal distribution whose mean and
/// standard deviation are computed from the input vector.
/// Fails if either the mean or the standard deviation is NaN.
let normal : VectorBaseImputation<float> =
    fun fdata index ->
        let mu = Seq.mean fdata
        let sigma = Seq.stDev fdata
        if System.Double.IsNaN mu || System.Double.IsNaN sigma then
            failwithf "Vector needs at least two non-missing value"
        else
            Distributions.Continuous.Normal.Sample mu sigma
58+
59+
60+
///// Imputation by sampling from a Gaussian normal distribution based on the input vector
61+
//let normalTruncated : VectorBaseImputation<float> =
62+
// fun fdata index ->
63+
// let mean = Seq.mean fdata
64+
// let std = Seq.stDev fdata
65+
// if not(System.Double.IsNaN(mean) || System.Double.IsNaN(std)) then
66+
// Distributions.Continuous.Normal.Sample mean std
67+
// else
68+
// failwithf "Vector needs at least two non-missing value"
69+
70+
71+
/// <summary>Imputation by k-nearest neighbour</summary>
/// <remarks>Distances between rows are computed with DistanceMetrics.euclideanNaNSquared.</remarks>
/// <param name="k">Number of nearest rows taken into account</param>
/// <returns>Matrix-based imputation that returns, for the requested index, the mean of that position over the k rows of the dataset closest to the query row</returns>
let kNearestImpute k : MatrixBaseImputation<float[],float> =
    fun data arr index ->

        // Pairs every candidate with its distance to the query, orders the pairs
        // by ascending distance and keeps the first `count` of them.
        let closestTo (metric:DistanceMetrics.Distance<'a>) count (candidates: 'a array) (query:'a) =
            candidates
            |> Array.map (fun candidate -> (metric candidate query, candidate))
            |> Array.sortBy fst
            |> Array.take count

        let metric = DistanceMetrics.euclideanNaNSquared
        let neighbours = closestTo metric k (data |> Array.ofSeq) arr
        // Transposing the neighbour rows groups the values by position, so the
        // mean of each group is the imputed value for that position.
        let positionMeans =
            neighbours
            |> Array.map snd
            |> JaggedArray.transpose
            |> Array.map Seq.mean
        positionMeans.[index]
95+
96+
97+
/// <summary>Imputes column-wise by vector-based imputation</summary>
/// <remarks>The data is transposed, every column is imputed from its own non-missing values, and the result is transposed back.</remarks>
/// <param name="impute">Vector-based imputation; receives the non-missing values of a column and the position to fill</param>
/// <param name="isMissing">Predicate that marks a value as missing</param>
/// <param name="data">Rows of the dataset</param>
/// <returns>The dataset with every missing value replaced by the imputed one</returns>
let imputeColWiseBy (impute: VectorBaseImputation<'a>) isMissing (data : seq<#seq<'a>>) =
    let fillColumn (column: 'a[]) =
        let present = column |> Array.filter (fun value -> not (isMissing value))
        // Applied once per column, exactly like the partially applied impute' before.
        let fill = impute present
        column
        |> Array.mapi (fun position value -> if isMissing value then fill position else value)
    data
    |> JaggedArray.ofJaggedSeq
    |> JaggedArray.transpose
    |> Array.map fillColumn
    |> JaggedArray.transpose
118+
119+
120+
/// <summary>Imputes row-wise by vector-based imputation</summary>
/// <remarks>Every row is imputed from its own non-missing values.</remarks>
/// <param name="impute">Vector-based imputation; receives the non-missing values of a row and the position to fill</param>
/// <param name="isMissing">Predicate that marks a value as missing</param>
/// <param name="data">Rows of the dataset</param>
/// <returns>The rows with every missing value replaced by the imputed one</returns>
let imputeRowWiseBy (impute: VectorBaseImputation<'a>) isMissing (data : seq<#seq<'a>>) =
    let fillRow (row: 'a[]) =
        let present = row |> Array.filter (fun value -> not (isMissing value))
        // Applied once per row, exactly like the partially applied impute' before.
        let fill = impute present
        row
        |> Array.mapi (fun position value -> if isMissing value then fill position else value)
    data
    |> JaggedArray.ofJaggedSeq
    |> Array.map fillRow
139+
140+
141+
/// <summary>Imputes rows by matrix-based imputation</summary>
/// <remarks>Only rows without any missing value are handed to the imputation as reference dataset.</remarks>
/// <param name="impute">Matrix-based imputation; receives the complete rows, the row being imputed and the position to fill</param>
/// <param name="isMissing">Predicate that marks a value as missing</param>
/// <param name="data">Rows of the dataset</param>
/// <returns>The rows with every missing value replaced by the imputed one</returns>
let imputeBy (impute: MatrixBaseImputation<'a[],'a>) isMissing data =
    // Reference set: the rows in which no value is missing, materialised as arrays.
    let completeRows =
        data
        |> Seq.filter (fun row -> row |> Seq.exists isMissing |> not)
        |> Seq.map Seq.toArray
        |> Seq.toArray

    let fillRow (row: 'a[]) =
        let values = row |> Array.ofSeq
        // Applied once per row, exactly like the partially applied impute' before.
        let fill = impute completeRows values
        values
        |> Array.mapi (fun position value -> if isMissing value then fill position else value)

    data
    |> JaggedArray.ofJaggedSeq
    |> Array.map fillRow
165+
)
166+
167+
168+
169+
170+

src/FSharp.Stats/ML/Impute.fs

+18-93
Original file line numberDiff line numberDiff line change
@@ -1,37 +1,20 @@
11
namespace FSharp.Stats.ML
22

33
open FSharp.Stats
4-
//open FSharp.Care
5-
//open FSharp.Care.Collections
4+
open System
65

76
/// Module for data imputation and missing value filtering
7+
[<Obsolete("This module is deprecated. Use FSharp.Stats.ML.Imputation instead")>]
88
module Impute =
99

1010
module Cleaning =
11-
11+
1212
let calcFractionBy (isMissing) (dataRow:seq<'a>) =
13-
dataRow
14-
|> Seq.fold (fun (mc,nmc) state ->
15-
match isMissing state with
16-
| true -> (mc+1,nmc)
17-
| false -> (mc,nmc+1) )
18-
(0,0)
19-
|> fun (mc,nmc) -> float mc / float (nmc + mc)
13+
Imputation.Cleaning.calcFractionBy isMissing dataRow
2014

2115

2216
let removeAllBy f threshold (data:seq<#seq<'a>>) =
23-
data
24-
|> Seq.filter (fun row -> f row <= threshold )
25-
26-
27-
/// Type definintion for a vector based imputation.
28-
/// The imputed values are based only on the given array
29-
type VectorBaseImputation<'a> = seq<'a> -> int -> 'a
30-
31-
/// Type definintion for a vector based imputation
32-
/// The imputed values are based on the given whole dataset
33-
type MatrixBaseImputation<'a,'b> = seq<'a> -> 'a -> int -> 'b
34-
17+
Imputation.Cleaning.removeAllBy f threshold data
3518

3619
/// <summary>Imputation by random sampling from the input vector</summary>
3720
/// <remarks></remarks>
@@ -41,22 +24,13 @@ module Impute =
4124
/// <code>
4225
/// </code>
4326
/// </example>
44-
let rnd (rnd:System.Random) : VectorBaseImputation<'a> =
45-
fun fdata index ->
46-
let farr = Array.ofSeq fdata
47-
if farr.Length < 1 then failwithf "Vector needs at least one non-missing value"
48-
farr.[rnd.Next(0,farr.Length - 1)]
27+
let rnd (rnd:System.Random) : Imputation.VectorBaseImputation<'a> =
28+
Imputation.rnd rnd
4929

5030

5131
/// Imputation by sampling from a Gaussian normal distribution based on the input vector
52-
let normal : VectorBaseImputation<float> =
53-
fun fdata index ->
54-
let mean = Seq.mean fdata
55-
let std = Seq.stDev fdata
56-
if not(System.Double.IsNaN(mean) || System.Double.IsNaN(std)) then
57-
Distributions.Continuous.Normal.Sample mean std
58-
else
59-
failwithf "Vector needs at least two non-missing value"
32+
let normal: Imputation.VectorBaseImputation<float> =
33+
Imputation.normal
6034

6135

6236
///// Imputation by sampling from a gausian normal distribution based on the input vector
@@ -78,23 +52,8 @@ module Impute =
7852
/// <code>
7953
/// </code>
8054
/// </example>
81-
let kNearestImpute k : MatrixBaseImputation<float[],float> =
82-
fun data arr index ->
83-
84-
let kNearestFrom (distance:DistanceMetrics.Distance<'a>) k (arr: 'a array) (queryCoordinates:'a) =
85-
arr
86-
|> Array.map (fun t -> (distance t queryCoordinates,t))
87-
|> Array.sortBy fst
88-
|> Array.take k
89-
90-
let euclNanSq = DistanceMetrics.euclideanNaNSquared
91-
let tmpArr =
92-
kNearestFrom euclNanSq k (data |> Array.ofSeq) arr
93-
|> Array.map snd
94-
|> JaggedArray.transpose
95-
|> Array.map Seq.mean
96-
tmpArr.[index]
97-
55+
let kNearestImpute k : Imputation.MatrixBaseImputation<float[],float> =
56+
Imputation.kNearestImpute k
9857

9958
/// <summary>Imputes column-wise by vector-based imputation</summary>
10059
/// <remarks></remarks>
@@ -106,18 +65,9 @@ module Impute =
10665
/// <code>
10766
/// </code>
10867
/// </example>
109-
let imputeColWiseBy (impute: VectorBaseImputation<'a>) isMissing (data : seq<#seq<'a>>) =
110-
data
111-
|> JaggedArray.ofJaggedSeq
112-
|> JaggedArray.transpose
113-
|> Array.map (fun col ->
114-
let fCol = col |> Array.filter (isMissing >> not)
115-
let impute' = impute fCol
116-
col
117-
|> Array.mapi (fun i v -> if isMissing v then (impute' i) else v)
118-
)
119-
|> JaggedArray.transpose
120-
68+
let imputeColWiseBy (impute: Imputation.VectorBaseImputation<'a>) isMissing (data : seq<#seq<'a>>) =
69+
Imputation.imputeColWiseBy impute isMissing data
70+
12171

12272
/// <summary>Imputes row-wise by vector-based imputation</summary>
12373
/// <remarks></remarks>
@@ -129,15 +79,8 @@ module Impute =
12979
/// <code>
13080
/// </code>
13181
/// </example>
132-
let imputeRowWiseBy (impute: VectorBaseImputation<'a>) isMissing (data : seq<#seq<'a>>) =
133-
data
134-
|> JaggedArray.ofJaggedSeq
135-
|> Array.map (fun row ->
136-
let fRow = row |> Array.filter (isMissing >> not)
137-
let impute' = impute fRow
138-
row
139-
|> Array.mapi (fun i v -> if isMissing v then (impute' i) else v)
140-
)
82+
let imputeRowWiseBy (impute: Imputation.VectorBaseImputation<'a>) isMissing (data : seq<#seq<'a>>) =
83+
Imputation.imputeRowWiseBy impute isMissing data
14184

14285

14386
/// <summary>Imputes rows by matrix-based imputation</summary>
@@ -150,23 +93,5 @@ module Impute =
15093
/// <code>
15194
/// </code>
15295
/// </example>
153-
let imputeBy (impute: MatrixBaseImputation<'a[],'a>) isMissing data =
154-
let fData =
155-
data
156-
|> Seq.filter (fun row -> row |> Seq.exists isMissing |> not)
157-
|> Seq.map (fun row -> row |> Seq.toArray)
158-
|> Seq.toArray
159-
160-
data
161-
|> JaggedArray.ofJaggedSeq
162-
|> Array.map (fun row ->
163-
let row' = row |> Array.ofSeq
164-
let impute' = impute fData row'
165-
row'
166-
|> Array.mapi (fun i v -> if isMissing v then (impute' i) else v)
167-
)
168-
169-
170-
171-
172-
96+
let imputeBy (impute: Imputation.MatrixBaseImputation<'a[],'a>) isMissing data =
97+
Imputation.imputeBy impute isMissing data

0 commit comments

Comments
 (0)