Skip to content

Commit

Permalink
add KNN classifier and logic to ML Unsupervised, add unit tests
Browse files Browse the repository at this point in the history
  • Loading branch information
s-weil committed Oct 17, 2023
1 parent 4719e96 commit 4b6f1cf
Show file tree
Hide file tree
Showing 3 changed files with 305 additions and 1 deletion.
1 change: 1 addition & 0 deletions src/FSharp.Stats/FSharp.Stats.fsproj
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,7 @@
<Compile Include="ML\Unsupervised\HierarchicalClustering.fs" />
<Compile Include="ML\Unsupervised\DbScan.fs" />
<Compile Include="ML\Unsupervised\ClusterNumber.fs" />
<Compile Include="ML\Unsupervised\KNN.fs" />
<!-- MISC -->
<Compile Include="FSIPrinters.fs" />
<Compile Include="ConfidenceInterval.fs" />
Expand Down
188 changes: 188 additions & 0 deletions src/FSharp.Stats/ML/Unsupervised/KNN.fs
Original file line number Diff line number Diff line change
@@ -0,0 +1,188 @@
namespace FSharp.Stats.ML.Unsupervised


/// A point `p` together with the `label` assigned to it.
type LabeledPoint<'a, 'l> =
    {
        /// The data point.
        p : 'a
        /// The label attached to `p`.
        label : 'l
    }

    /// Builds a labeled point from the point `p` and the label `l`.
    static member create(p, l) = { p = p; label = l }

[<RequireQualifiedAccess>]
module KNN =

open FSharp.Stats.DistanceMetrics

module Array =

/// <summary>Classifies `x` by majority vote among the labels of its `k` nearest labeled points.</summary>
/// <remarks>
/// For `k` greater than 1 the array `labeledPoints` is sorted in place by distance to `x`,
/// so its order may be mutated and the function is not thread safe.
/// If `k` exceeds the number of labeled points, all points take part in the vote.
/// </remarks>
/// <param name="distance">the distance function, e.g. `euclidean`</param>
/// <param name="labeledPoints">The labeled points to search for neighbors</param>
/// <param name="k">The number of nearest neighbors to look from x</param>
/// <param name="x">The point to classify</param>
/// <returns>
/// The most common label among the k nearest neighbors of x,
/// or `None` if `labeledPoints` is empty or `k` is not positive.
/// </returns>
/// <example>
/// <code>
/// let points = [| { p = 0.0; label = "blue" }; { p = 0.2; label = "blue" }; { p = -1.0; label = "red" } |]
/// let distance a b = abs (a - b)
/// KNN.Array.predict distance points 3 0.1 // Some "blue"
/// </code>
/// </example>
let inline predict (distance : Distance<'a>) (labeledPoints: LabeledPoint<'a, 'l> array) (k : int) (x: 'a) : 'l option =
    if Array.isEmpty labeledPoints || k <= 0 then
        None
    elif k = 1 then
        // One neighbour only: a linear scan for the closest point is enough (no sort, no mutation).
        let nearest = labeledPoints |> Array.minBy (fun lp -> distance lp.p x)
        Some nearest.label
    else
        labeledPoints |> Array.sortInPlaceBy (fun lp -> distance lp.p x)

        // Array.take throws when asked for more elements than exist, so clamp k.
        let neighborCount = min k labeledPoints.Length

        let label =
            labeledPoints
            |> Array.take neighborCount
            |> Array.countBy (fun lp -> lp.label)
            |> Array.maxBy snd // snd = number of votes
            |> fst

        Some label

/// <summary>
/// Classifies `x` by majority vote among the labels of its `k` nearest labeled points,
/// reading `labeledPoints` through an `inref` and leaving its order untouched.
/// </summary>
/// <remarks>If `k` exceeds the number of labeled points, all points take part in the vote.</remarks>
/// <param name="distance">the distance function, e.g. `euclidean`</param>
/// <param name="labeledPoints">The labeled points to search for neighbors</param>
/// <param name="k">The number of nearest neighbors to look from x</param>
/// <param name="x">The point to classify</param>
/// <returns>
/// The most common label among the k nearest neighbors of x,
/// or `None` if `labeledPoints` is empty or `k` is not positive.
/// </returns>
let inline predictInRef<'l when 'l: equality and 'l: comparison>
    (distance : Distance<'a>)
    (labeledPoints: inref<LabeledPoint<'a, 'l> array>)
    (k : int)
    (x : 'a)
    : 'l option =

    if Array.isEmpty labeledPoints || k <= 0 then
        None
    elif k = 1 then
        // One neighbour only: a linear scan for the closest point is enough.
        let nearest = labeledPoints |> Array.minBy (fun lp -> distance lp.p x)
        Some nearest.label
    else
        // Array.take throws when asked for more elements than exist, so clamp k.
        let neighborCount = min k (Array.length labeledPoints)

        let kNearestNeighborIndices =
            labeledPoints
            |> Array.mapi (fun idx lp -> idx, distance lp.p x)
            |> Array.sortBy snd // snd = distance value
            |> Array.take neighborCount

        let labels = Array.zeroCreate neighborCount

        // Plain loop rather than a lambda: the inref parameter is indexed directly here.
        for i in 0 .. neighborCount - 1 do
            let idx, _ = kNearestNeighborIndices.[i]
            let label: 'l = labeledPoints.[idx].label
            labels.[i] <- label

        let label =
            labels
            |> Seq.countBy id
            |> Seq.maxBy snd // snd = number of votes
            |> fst

        Some label



module Seq =

/// <summary>
/// Classifies `x` by majority vote among the labels of its `k` nearest points,
/// where `points` and `labels` are parallel sequences.
/// </summary>
/// <remarks>
/// Both sequences are enumerated once into arrays.
/// If `k` exceeds the number of points, all points take part in the vote.
/// </remarks>
/// <param name="distance">the distance function, e.g. `euclidean`</param>
/// <param name="points">The points to search for neighbors</param>
/// <param name="labels">The labels; `labels` item i belongs to `points` item i</param>
/// <param name="k">The number of nearest neighbors to look from x</param>
/// <param name="x">The point to classify</param>
/// <returns>
/// The most common label among the k nearest neighbors of x, or `None` if `points` is empty,
/// `points` and `labels` differ in length, or `k` is not positive.
/// </returns>
let inline predict<'l when 'l: equality and 'l: comparison>
    (distance : Distance<'a>)
    (points : 'a seq)
    (labels : 'l seq)
    (k : int)
    (x : 'a)
    : 'l option =

    // Materialise once: avoids re-enumerating the inputs and gives O(1) label lookup by index.
    let pointArray = Seq.toArray points
    let labelArray = Seq.toArray labels

    if pointArray.Length = 0 || pointArray.Length <> labelArray.Length || k <= 0 then
        None
    else
        let label =
            pointArray
            |> Seq.mapi (fun idx p -> idx, distance p x)
            |> Seq.sortBy snd // snd = distance value
            |> Seq.truncate k // unlike Seq.take, does not throw when fewer than k points exist
            |> Seq.countBy (fun (idx, _) -> labelArray.[idx])
            |> Seq.maxBy snd // snd = number of votes
            |> fst

        Some label

// let inline predict<'l when 'l: equality and 'l: comparison>
// (distance : Distance<'a>)
// (labeledPoints : LabeledPoint<'a, 'l> seq)
// (k : int)
// (x : 'a)
// : 'l option =

// if Seq.isEmpty labeledPoints || k <= 0 then
// None
// elif k = 1 then
// Some (Seq.head labeledPoints).label
// else

// let distanceIndices =
// labeledPoints
// |> Seq.map (fun p -> p, distance p.p x)

// let kNearestNeighborIndices =
// distanceIndices
// |> Seq.sortBy snd // snd = distance value
// |> Seq.take k

// let label =
// kNearestNeighborIndices
// |> Seq.countBy (fun (p, _) -> p.label)
// |> Seq.maxBy fst
// |> fst

// Some label



/// Python Style KNeighborsClassifier: stores labeled points via `fit` and classifies
/// new points via `predict`, using `distance` and the `k` nearest neighbors.
type Classifier<'a, 'l when 'l: equality and 'l: comparison>(distance: Distance<'a>, k: int) =

    /// The training data set by `fit`. Stays null until `fit` has been called.
    [<DefaultValue>] val mutable labeledPoints : LabeledPoint<'a, 'l> array

    /// The default number of neighbors used by `predict`.
    member val K = k with get, set

    /// The fitted points, or an empty array when `fit` has not been called yet,
    /// so that `predict` yields `None` instead of failing on a null array.
    member private this.TrainingData : LabeledPoint<'a, 'l> array =
        if isNull (box this.labeledPoints) then [||] else this.labeledPoints

    /// Replaces the default number of neighbors.
    member this.OverwriteK k =
        this.K <- k

    /// Fits the classifier to the given labeled points.
    member this.fit(lps : LabeledPoint<'a, 'l> array) =
        // Store a copy: Array.predict sorts the array it is given in place,
        // which would otherwise reorder the caller's data.
        this.labeledPoints <- Array.copy lps

    /// Fits the classifier to `points` and their `labels` (paired by index).
    member this.fit(points : 'a array, labels : 'l array) =
        let lps =
            (points, labels)
            ||> Array.map2 (fun p l -> LabeledPoint<'a, 'l>.create(p, l))
        this.labeledPoints <- lps

    /// Fits the classifier to a map from each label to the points carrying that label.
    member this.fit(labeledPoints: Map<'l, 'a array>) =
        let lps =
            labeledPoints
            |> Seq.collect (fun (KeyValue(label, points)) ->
                points |> Seq.map (fun p -> LabeledPoint<'a, 'l>.create(p, label)))
            |> Seq.toArray
        this.labeledPoints <- lps

    /// Predicts the label of `x`; `overwriteK` replaces `K` for this call only.
    member this.predict(x, ?overwriteK) : 'l option =
        Array.predict distance this.TrainingData (defaultArg overwriteK this.K) x

    /// Predicts the label of every point in `points`; `overwriteK` replaces `K` for this call only.
    member this.predict(points: 'a array, ?overwriteK) =
        let predict = Array.predict distance this.TrainingData (defaultArg overwriteK this.K)
        Array.map predict points
117 changes: 116 additions & 1 deletion tests/FSharp.Stats.Tests/ML.fs
Original file line number Diff line number Diff line change
Expand Up @@ -385,4 +385,119 @@ module hClust =
([(9, 0.0); (10, 0.1414213628); (11, 0.1732050776); (12, 0.2449489683);(13, 0.2645751238); (14, 0.3000000119); (15, 0.4123105705);(16, 0.6164414287)]|> List.map (fun x -> (fst x ,Math.round 10 (snd x ))))

"Distances and Labels won't work "
]
]

module KNN =
    open FSharp.Stats.ML.Unsupervised
    open FSharp.Stats.ML.Unsupervised.KNN.Array

    /// Tests for the k-nearest-neighbor prediction functions and the classifier type.
    [<Tests>]
    let knnTests =
        testList "KNN Tests" [
            // Two clusters of 2D points: "red" around the lower left, "blue" around the upper right.
            testCase "blueVsRedPoints" <| fun () ->
                let withLabel label points =
                    points |> Array.map (fun p -> LabeledPoint<float list, string>.create(p, label))

                let redPoints =
                    [|
                        [ 2.0; 4.0 ]
                        [ 1.0; 3.0 ]
                        [ 2.0; 4.0 ]
                        [ 3.0; 2.0 ]
                        [ 2.0; 1.0 ]
                    |]
                    |> withLabel "red"

                let bluePoints =
                    [|
                        [ 5.0; 6.0 ]
                        [ 4.0; 5.0 ]
                        [ 4.0; 6.0 ]
                        [ 6.0; 6.0 ]
                        [ 5.0; 4.0 ]
                    |]
                    |> withLabel "blue"

                let labeledPoints = Array.append redPoints bluePoints
                let classify = predict FSharp.Stats.DistanceMetrics.euclidean labeledPoints

                let nearRedCluster = classify 3 [3.0; 3.0]

                Expect.isTrue nearRedCluster.IsSome "Has Label"
                Expect.equal nearRedCluster.Value "red" "label should be red"

                let nearBlueCluster = classify 3 [6.0; 6.0]

                Expect.isTrue nearBlueCluster.IsSome "Has Label"
                Expect.equal nearBlueCluster.Value "blue" "label should be blue"

            // Non-negative scalars are "blue", their mirror images are "red".
            testCase "symmetricallyDistributedPoints" <| fun () ->
                let magnitudes = Array.init 20 (fun idx -> 0.1 * float idx)

                let bluePoints =
                    magnitudes |> Array.map (fun m -> LabeledPoint<float, string>.create(m, "blue"))
                let redPoints =
                    magnitudes |> Array.map (fun m -> LabeledPoint<float, string>.create(-m, "red"))

                let labeledPoints = Array.append bluePoints redPoints

                let distance a b = abs (a - b)
                let classify = KNN.Array.predict distance labeledPoints

                // '0' is an ambiguous case due to the symmetry; it may depend on the initial sorting, ...
                seq { 1 .. 100 }
                |> Seq.iter (fun sample ->
                    let positive = classify 3 (float sample)
                    Expect.isTrue positive.IsSome "Has Label"
                    Expect.equal positive.Value "blue" "label should be blue"

                    let negative = classify 3 (-(float sample))
                    Expect.isTrue negative.IsSome "Has Label"
                    Expect.equal negative.Value "red" "label should be red"
                )

            // Same layout as above, driven through the Classifier type and its Map-based fit.
            testCase "symmetricallyDistributedPointsWithClassifier" <| fun () ->
                let magnitudes = Array.init 200 (fun idx -> 0.1 * float idx)

                let labeledPoints = Map [
                    "blue", magnitudes;
                    "red", magnitudes |> Array.map (fun m -> -m)
                ]

                let distance a b = abs (a - b)
                let knnClassifier = KNN.Classifier(distance, 5)
                knnClassifier.fit(labeledPoints)

                let positiveSamples = Array.init 100 (fun idx -> float (idx + 1))
                let negativeSamples = Array.init 100 (fun idx -> float -(idx + 1))

                let positivePredictions = knnClassifier.predict positiveSamples
                let negativePredictions = knnClassifier.predict negativeSamples

                (positivePredictions, negativePredictions)
                ||> Array.iter2 (fun posLabel negLabel ->
                    Expect.isTrue posLabel.IsSome "Has Label"
                    Expect.equal posLabel.Value "blue" "label should be blue"

                    Expect.isTrue negLabel.IsSome "Has Label"
                    Expect.equal negLabel.Value "red" "label should be red"
                )


            // testCase "symmetricallyDistributedPointsPARALLEL" <| fun () ->
            //     let points = Array.init 20 (fun idx -> 0.1 * float idx)

            //     let blues =
            //         points |> Array.map (fun p -> LabeledPoint<float, string>.create(p, "blue"))
            //     let reds =
            //         points |> Array.map (fun p -> LabeledPoint<float, string>.create(-p, "red"))

            //     let labeledPoints = Array.append blues reds

            //     let distance a b = abs (a - b)
            //     let prediction = KNN.Array.predict distance labeledPoints

            //     Array.init 200 (fun idx -> 1.0 + float idx * float (sign idx))
            //     |> Array.Parallel.iter (fun x ->
            //         let prediction = KNN.Array.Parallel.predictInRef distance &labeledPoints 3 x

            //         Expect.isTrue prediction.IsSome "Has Label"
            //         Expect.equal prediction.Value "blue" "label should be blue"
            //     )


        ]

0 comments on commit 4b6f1cf

Please sign in to comment.