From 2eb55cb49d060cb6b2b578bf9c3b7ab286617870 Mon Sep 17 00:00:00 2001 From: Kevin Malenfant Date: Fri, 26 Apr 2024 14:35:35 -0600 Subject: [PATCH 1/8] add correlation test cases --- tests/FSharp.Stats.Tests/Correlation.fs | 240 ++++++++++++++++++++++++ 1 file changed, 240 insertions(+) diff --git a/tests/FSharp.Stats.Tests/Correlation.fs b/tests/FSharp.Stats.Tests/Correlation.fs index dd8886a9e..62503527f 100644 --- a/tests/FSharp.Stats.Tests/Correlation.fs +++ b/tests/FSharp.Stats.Tests/Correlation.fs @@ -3,6 +3,246 @@ open System open FSharp.Stats.Correlation open Expecto +module TestData = + let doubles = + [ + {| + X = [| 0.769975279369337; -0.26975129370715756; -0.22164107602804684; -0.37964372892225584; 1.7976931348623157E+308; 0.6956489946628831; 0.8498674478461568; 0.007870060694074144 |] + Y = [| 8.05529523804792; -9.648443925108909; -1.215500483344818; 5E-324; -4.337558555754166; infinity; -7.497611995486394; -9.039643739188005 |] + Spearman = 0.09523809523809525 + KendallA = 0.07142857142857142 + KendallB = 0.07142857142857142 + KendallC = 0.07142857142857142 + Pearson = nan + |} + {| + X = [| -1.3946407056008117; -1.7976931348623157E+308; 0.02665139354486956; 0.16752887114290516; 0.6510630080261284 |] + Y = [| -5.934146660251358; -7.514325777080982; -2.869708043284536; -0.6743782342678939; -2.2164107602804686 |] + Spearman = 0.9 + KendallA = 0.8 + KendallB = 0.8 + KendallC = 0.8 + Pearson = nan //R returns 0.0 + |} + {| + X = [| -infinity; 3.2160411307302565 |] + Y = [| -3.8511452553484538; -5.393177399524884 |] + Spearman = -1.0 + KendallA = -1.0 + KendallB = -1.0 + KendallC = -1.0 + Pearson = nan + |} + {| + X = [| 5E-324; 0.4310933883901359; 1.1782225200518512; 4.490557012680512; -infinity; -0.05931977813647893 |] + Y = [| -1.7431196366262147; -3.3100232065058477; -infinity; 6.432082261460513; 8.025230948524591; 5E-324 |] + Spearman = -0.42857142857142855 + KendallA = -0.4666666666666667 + KendallB = -0.4666666666666667 + KendallC = 
-0.4666666666666667 + Pearson = nan + |} + {| + X = [| -0.6237678376055525; -0.02398140791055825; -0.33238783674585126; 5E-324; -0.9617738169271464; -0.6402018172171572; -0.7944049915885085 |] + Y = [| 7.487700704756412; 2.882382571594094; 0.6608761209968983; -1.7976931348623157E+308; 3.7699648024572516; -5.349991331399306; -6.943140018463384 |] + Spearman = -0.28571428571428564 + KendallA = -0.14285714285714285 + KendallB = -0.14285714285714285 + KendallC = -0.14285714285714285 + Pearson = nan // R returns -0.5693503001745431 + |} + {| + X = [| infinity; -0.4380145079632394; 0.2525563106400899; -0.7097994161043718; -infinity; 0.6891193732603421; -1.7976931348623157E+308; 3.3026058744137248 |] + Y = [| -4.619190203598879; -6.830939838589383; 4.262013366906972; -1.719153567018289; -5.8337600091398345; 3.631337095047412; 1.7976931348623157E+308; 1.7976931348623157E+308 |] + Spearman = 0.17964393928698885 + KendallA = 0.10714285714285714 + KendallB = 0.10910894511799618 + KendallC = 0.109375 + Pearson = nan + |} + + {| + X = [| -1.0; 1.0; -3.0; 0.0; 0.0; 2.0; -2.0 |] + Y = [| -3.0; -3.0; 0.0; 2.0; -2.0; -2.0; 1.0 |] + Spearman = -0.35781322366606727 + KendallA = -0.19047619047619047 + KendallB = -0.20519567041703082 + KendallC = -0.20408163265306123 + Pearson = -0.43649077143553344 + |} + {| + X = [| 1.0; 3.0; 3.0; -1.0 |] + Y = [| 3.0; -1.0; -1.0; 1.0 |] + Spearman = -0.7777777777777779 + KendallA = -0.5 + KendallB = -0.5999999999999999 + KendallC = -0.5625 + Pearson = -0.6363636363636365 + |} + {| + X = [| 0.0; 2.0; -2.0; 1.0; 1.0; 3.0; -1.0; 2.0 |] + Y = [| -2.0; -2.0; 1.0; 3.0; 3.0; -1.0; 2.0; -3.0 |] + Spearman = -0.3719512195121951 + KendallA = -0.17857142857142858 + KendallB = -0.19230769230769235 + KendallC = -0.1875 + Pearson = -0.41619003555011974 + |} + {| + X = [| 2.0; -3.0; -3.0; 0.0; 3.0 |] + Y = [| -3.0; 0.0; 0.0; 2.0; -2.0 |] + Spearman = -0.5789473684210528 + KendallA = -0.3 + KendallB = -0.3333333333333334 + KendallC = -0.32 + Pearson = 
-0.5823356699841468 + |} + {| + X = [| 1.0; 3.0; -1.0; 2.0; 2.0; -2.0; 0.0; 3.0; 3.0 |] + Y = [| -1.0; -1.0; 2.0; -3.0; -3.0; 0.0; 3.0; -2.0; -2.0 |] + Spearman = -0.6293337301361106 + KendallA = -0.3611111111111111 + KendallB = -0.40004734568283135 + KendallC = -0.3851851851851852 + Pearson = -0.6851039625605218 + |} + {| + X = [| 3.0; -2.0; -2.0; 1.0; -3.0; -3.0 |] + Y = [| -2.0; 1.0; 1.0; 3.0; -1.0; 2.0 |] + Spearman = -0.14927035850663303 + KendallA = -0.06666666666666667 + KendallB = -0.07412493166611012 + KendallC = -0.07407407407407407 + Pearson = -0.2631174057921088 + |} + {| + X = [| 2.0; -3.0; 0.0 |] + Y = [| -3.0; 0.0; 3.0 |] + Spearman = -0.5 + KendallA = -0.3333333333333333 + KendallB = -0.33333333333333337 + KendallC = -0.3333333333333333 + Pearson = -0.39735970711951313 + |} + {| + X = [| -3.0; -1.0; -1.0; 2.0; -2.0; -2.0; 0.0 |] + Y = [| -1.0; 2.0; 2.0; -3.0; 0.0; 3.0; 3.0 |] + Spearman = 0.00925925925925926 + KendallA = 0.09523809523809523 + KendallB = 0.10526315789473686 + KendallC = 0.10204081632653061 + Pearson = -0.3150360061726043 + |} + {| + X = [| 2.0; 2.0; -1.0 |] + Y = [| -2.0; 1.0; -3.0 |] + Spearman = 0.8660254037844387 + KendallA = 0.6666666666666666 + KendallB = 0.8164965809277261 + KendallC = 0.8888888888888888 + Pearson = 0.6933752452815365 + |} + {| + X = [| -3.0; 1.0; 0.0; -2.0 |] + Y = [| -2.0; 0.0; 2.0; 2.0 |] + Spearman = 0.31622776601683794 + KendallA = 0.16666666666666666 + KendallB = 0.18257418583505536 + KendallC = 0.1875 + Pearson = 0.3813850356982369 + |} + ] + let ints = + [ + {| + X = [| 1; 3; -1; 2; 2; -2; 0; 3; 3|] + Y = [| -1; -1; 2; -3; -3; 0; 3; -2; -2|] + Spearman = -0.6293337301361106 + KendallA = -0.3611111111111111 + KendallB = -0.40004734568283135 + KendallC = -0.3851851851851852 + Pearson = -0.6851039625605218 + |} + {| + X = [| 3; -2; -2; 1; -3; -3|] + Y = [| -2; 1; 1; 3; -1; 2|] + Spearman = -0.14927035850663303 + KendallA = -0.06666666666666667 + KendallB = -0.07412493166611012 + KendallC = 
-0.07407407407407407 + Pearson = -0.2631174057921088 + |} + {| + X = [| 2; -3; 0|] + Y = [| -3; 0; 3|] + Spearman = -0.5 + KendallA = -0.3333333333333333 + KendallB = -0.33333333333333337 + KendallC = -0.3333333333333333 + Pearson = -0.39735970711951313 + |} + {| + X = [| -3; -1; -1; 2; -2; -2; 0|] + Y = [| -1; 2; 2; -3; 0; 3; 3|] + Spearman = 0.00925925925925926 + KendallA = 0.09523809523809523 + KendallB = 0.10526315789473686 + KendallC = 0.10204081632653061 + Pearson = -0.3150360061726043 + |} + {| + X = [| 2; 2; -1|] + Y = [| -2; 1; -3|] + Spearman = 0.8660254037844387 + KendallA = 0.6666666666666666 + KendallB = 0.8164965809277261 + KendallC = 0.8888888888888888 + Pearson = 0.6933752452815365 + |} + {| + X = [| -3; 1; 0; -2|] + Y = [| -2; 0; 2; 2|] + Spearman = 0.31622776601683794 + KendallA = 0.16666666666666666 + KendallB = 0.18257418583505536 + KendallC = 0.1875 + Pearson = 0.3813850356982369 + |} + ] + +let inline makeTestList listName caseName corr prop cases = + let getX x = ( ^a : (member X : ^t[]) x) + let getY x = ( ^a : (member Y : ^t[]) x) + cases + |> List.mapi + (fun i x -> + let i = i + 1 + [ + testCase $"{caseName} Case {i}" <| fun () -> + let corr = corr (getX x) (getY x) + if Double.IsNaN (prop x) then + Expect.isTrue (Double.IsNaN corr) "Should be equal (double precision)" + else + Expect.floatClose Accuracy.high corr (prop x) "Should be equal (double precision)" + ] + ) + |> List.concat + |> testList $"Correlation.Seq.{listName}" + +[] +let kendallTauBDoubles = TestData.doubles |> makeTestList "kendall" "Double" Seq.kendall (fun x -> x.KendallB) +[] +let kendallTauBInts = TestData.ints |> makeTestList "kendall" "Int" Seq.kendall (fun x -> x.KendallB) +[] +let pearsonDoubles = TestData.doubles |> makeTestList "pearson" "Double" Seq.pearson (fun x -> x.Pearson) +[] +let pearsonInts = TestData.ints |> makeTestList "pearson" "Int" Seq.pearson (fun x -> x.Pearson) +[] +let spearmanDoubles = TestData.doubles |> makeTestList "spearman" "Double" 
Seq.spearman (fun x -> x.Spearman) +[] +let spearmanInts = TestData.ints |> makeTestList "spearman" "Int" Seq.spearman (fun x -> x.Spearman) + + [] let kendallCorrelationTests = // tested with R Kendall(x,y) function From f7a326151453a69f0a39dc7db16933bb4a9865e1 Mon Sep 17 00:00:00 2001 From: Kevin Malenfant Date: Sat, 27 Apr 2024 14:57:55 -0600 Subject: [PATCH 2/8] rewrite kendall add tauc and taua variants --- src/FSharp.Stats/Correlation.fs | 265 ++++++++++++++++++++---- tests/FSharp.Stats.Tests/Correlation.fs | 12 +- 2 files changed, 233 insertions(+), 44 deletions(-) diff --git a/src/FSharp.Stats/Correlation.fs b/src/FSharp.Stats/Correlation.fs index 3eab89497..1b9a14516 100644 --- a/src/FSharp.Stats/Correlation.fs +++ b/src/FSharp.Stats/Correlation.fs @@ -282,8 +282,152 @@ module Correlation = |> Seq.map f |> spearmanOfPairs - /// Kendall Correlation Coefficient - /// Computes Kendall rank correlation coefficient between two sequences of observations. + + module internal Kendall = + // x: 'a[] -> y: 'b[] -> pq: float -> n0: int -> n1: int -> n2: int -> 'c + // - x: The first array of observations. + // - y: The second array of observations. + // - pq: Number of concordant minues the number of discordant pairs. + // - n0: he number of pairs of observations. + // - n1: sum_i(t_i(t_i-1)/2) where t_is is t_i he number of pairs of observations with the same x value. + // - n2: sum_i(u_i(u_i-1)/2) where u_is is u_i he number of pairs of observations with the same y value. + + /// + /// Tau A - Make no adjustments for ties + /// + /// The first array of observations. + /// The second array of observations. + /// Number of concordant minues the number of discordant pairs. + /// The number of pairs of observations. + /// sum_i(t_i(t_i-1)/2) where t_is is t_i he number of pairs of observations with the same x value. + /// sum_i(u_i(u_i-1)/2) where u_is is u_i he number of pairs of observations with the same y value. + /// The Kendall tau A statistic. 
+ let tauA _x _y pq n0 _n1 _n2 = + pq / float n0 + /// + /// Tau B - Adjust for ties. tau_b = pq / sqrt((n0 - n1)(n0 - n2)) + /// + /// The first array of observations. + /// The second array of observations. + /// Number of concordant minues the number of discordant pairs. + /// The number of pairs of observations. + /// sum_i(t_i(t_i-1)/2) where t_is is t_i he number of pairs of observations with the same x value. + /// sum_i(u_i(u_i-1)/2) where u_is is u_i he number of pairs of observations with the same y value. + /// The Kendall tau B statistic. + let tauB _x _y pq n0 n1 n2 = + if n0 = n1 || n0 = n2 then nan else + pq / sqrt (float (n0 - n1) * float (n0 - n2)) + /// + /// Tau C - Adjust for ties in x and y. tau_c = 2pq / (n^2 * (m-1)/m) where m = min(distinct x, distinct y) + /// + /// The first array of observations. + /// The second array of observations. + /// Number of concordant minues the number of discordant pairs. + /// The number of pairs of observations. + /// sum_i(t_i(t_i-1)/2) where t_is is t_i he number of pairs of observations with the same x value. + /// sum_i(u_i(u_i-1)/2) where u_is is u_i he number of pairs of observations with the same y value. + /// The Kendall tau C statistic. + let tauC (x : _[]) y pq _n0 _n1 _n2 = + let n = x.Length + if n = 0 then nan else + let m = min (x |> Seq.distinct |> Seq.length) (y |> Seq.distinct |> Seq.length) |> double + let d = double(n*n)*(m-1.)/m + 2.0*pq / d + + /// + /// Computes the Kendall rank correlation coefficient between two sequences of observations. Tau function is provided as a parameter. + /// + /// + /// The Kendall rank correlation coefficient is a statistic used to measure the ordinal association between two measured quantities. + /// It is a measure of rank correlation: the similarity of the orderings of the data when ranked by each of the quantities. + /// + /// + /// The Kendall tau function to use. 
x: 'a[] -> y: 'b[] -> pq: float -> n0: int -> n1: int -> n2: int -> 'c + /// - x: The first array of observations. + /// - y: The second array of observations. + /// - pq: Number of concordant minues the number of discordant pairs. + /// - n0: he number of pairs of observations. + /// - n1: sum_i(t_i(t_i-1)/2) where t_is is t_i he number of pairs of observations with the same x value. + /// - n2: sum_i(u_i(u_i-1)/2) where u_is is u_i he number of pairs of observations with the same y value. + /// The function would generally return the Kendall tau statistic, however, this setup to allow for returning other values, p-values, multiple statistics, etc. + /// + /// The first sequence of observations. + /// The second sequence of observations. + let internal kendallTau tau (x: 'a[]) (y: 'b[]) = + // Kendall's tau using the O(n log n) algorithm of Knight (1966). + // - Initial sort by x, then by y. + // - Count the number of swaps needed to sort by y. + // - Count the number of concordant and discordant pairs. + // - Count the number of ties in x, y, and both. + // - Calculate the tau statistic. + // - tau a: no adjustment for ties. + // - tau b: adjustment for ties in x. + // - tau c: adjustment for ties in x and y. + if x.Length <> y.Length then + invalidArg "y" "The input arrays must have the same length" + elif x.Length = 0 then + tau x y nan 0 0 0 + else + let n = x.Length + let a = [| 0 .. 
n - 1 |] + let sortedIdx = a |> Array.sortBy (fun a -> x.[a], y.[a]) + let rec mergesort offset length = + match length with + | 1 -> 0 + | 2 -> + if y.[sortedIdx.[offset]] <= y.[sortedIdx.[offset + 1]] then + 0 + else + Array.swapInPlace offset (offset + 1) sortedIdx + 1 + | _ -> + let leftLength = length / 2 + let rightLength = length - leftLength + let middleIndex = offset + leftLength + let swaps = mergesort offset leftLength + mergesort middleIndex rightLength + if y.[sortedIdx.[middleIndex - 1]] < y.[sortedIdx.[middleIndex]] then + swaps + else + let rec merge i r l swaps = + if r < leftLength || l < rightLength then + if l >= rightLength || (r < leftLength && y.[sortedIdx.[offset + r]] <= y.[sortedIdx.[middleIndex + l]]) then + let d = i - r |> max 0 + let swaps = swaps + d + a.[i] <- sortedIdx.[offset + r] + merge (i + 1) (r + 1) l swaps + else + let d = (offset + i) - (middleIndex + l) |> max 0 + let swaps = swaps + d + a.[i] <- sortedIdx.[middleIndex + l] + merge (i + 1) r (l + 1) swaps + else + swaps + let swaps = merge 0 0 0 swaps + Array.blit a 0 sortedIdx offset length + swaps + let tallyTies noTie = + let mutable k = 0 + let mutable sum = 0 + for i in 1 .. n - 1 do + if noTie k i then + sum <- sum + (i - k) * (i - k - 1) / 2 + k <- i + sum + (n - k) * (n - k - 1) / 2 + let n3 = tallyTies (fun k i -> x.[sortedIdx.[k]] <> x.[sortedIdx.[i]] || y.[sortedIdx.[k]] <> y.[sortedIdx.[i]]) + let n1 = tallyTies (fun k i -> x.[sortedIdx.[k]] <> x.[sortedIdx.[i]]) + let swaps = mergesort 0 n + let n2 = tallyTies (fun k i -> y.[sortedIdx.[k]] <> y.[sortedIdx.[i]]) + let n0 = n * (n - 1) / 2 + let pq = ((float (n0 - n1 - n2 + n3)) - 2.0 * float swaps) + tau x y pq n0 n1 n2 + + /// Kendall Correlation Coefficient + /// Computes Kendall rank correlation coefficient between two sequences of observations. Tau-a is used to acount for ties. 
+ /// $/tau_a = (n_c - n_d) / n_0$ + /// - $n_c$: number of concordant pairs + /// - $n_d$: number of discordant pairs + /// - $n_0$: number of pairs of observations + /// /// The first sequence of observations. /// The second sequence of observations. /// Kendall rank correlation coefficient of setA and setB @@ -292,51 +436,88 @@ module Correlation = /// let x = [5.05;6.75;3.21;2.66] /// let y = [1.65;26.5;-0.64;6.95] /// - /// Seq.kendall x y // evaluates to 0.3333333333 + /// Seq.kendallTauA x y // evaluates to 0.3333333333 /// /// - let kendall seq1 seq2 = + let kendallTauA seq1 seq2 = let setA = Array.ofSeq seq1 let setB = Array.ofSeq seq2 - let lengthArray = Array.length setA - let inline kendallCorrFun (setA:_[]) (setB:_[]) = - let rec loop i j cCon cDisc cTieA cTieB cPairs = - if i < lengthArray - 1 then - if j <= lengthArray - 1 then - if j > i then - if (setA.[i] > setA.[j] && setB.[i] > setB.[j]) || (setA.[i] < setA.[j] && setB.[i] < setB.[j]) then - loop i (j+1) (cCon + 1.0) cDisc cTieA cTieB (cPairs + 1.0) - - elif (setA.[i] > setA.[j] && setB.[i] < setB.[j]) || (setA.[i] < setA.[j] && setB.[i] > setB.[j]) then - loop i (j+1) cCon (cDisc + 1.0) cTieA cTieB (cPairs + 1.0) - - else - if (setA.[i] = setA.[j]) then - loop i (j+1) cCon cDisc (cTieA + 1.0) cTieB (cPairs + 1.0) - - else - loop i (j+1) cCon cDisc cTieA (cTieB + 1.0) (cPairs + 1.0) - else - loop i (j+1) cCon cDisc cTieA cTieB cPairs - - else - loop (i+1) 1 cCon cDisc cTieA cTieB cPairs - - else - let floatLength = lengthArray |> float - - if (cTieA <> 0.0) || (cTieB <> 0.0) then - let n = (floatLength * (floatLength - 1.0)) / 2.0 - let n1 = (cTieA * (cTieA - 1.0)) / 2.0 - let n2 = (cTieB * (cTieB - 1.0)) / 2.0 - (cCon - cDisc) / (sqrt ((n - n1) * (n - n2))) - - else - (cCon - cDisc) / ((floatLength * (floatLength - 1.0)) / 2.0) - - loop 0 1 0.0 0.0 0.0 0.0 0.0 + kendallTau Kendall.tauA setA setB + + /// Kendall Correlation Coefficient + /// Computes Kendall rank correlation coefficient between 
two sequences of observations. Tau-b is used to acount for ties. + /// $/tau_b = (n_c - n_d) / sqrt((n_0 - n_1) * (n_0 - n_2))$ + /// - $n_c$: number of concordant pairs + /// - $n_d$: number of discordant pairs + /// - $n_0$: number of pairs of observations + /// - $n_1$: number of pairs of observations with the same x value + /// - $n_2$: number of pairs of observations with the same y value + /// + /// The first sequence of observations. + /// The second sequence of observations. + /// Kendall rank correlation coefficient of setA and setB + /// + /// + /// let x = [5.05;6.75;3.21;2.66] + /// let y = [1.65;26.5;-0.64;6.95] + /// + /// Seq.kendallTauB x y // evaluates to 0.3333333333 + /// + /// + let kendallTauB seq1 seq2 = + let setA = Array.ofSeq seq1 + let setB = Array.ofSeq seq2 + kendallTau Kendall.tauB setA setB + + /// Kendall Correlation Coefficient + /// Computes Kendall rank correlation coefficient between two sequences of observations. Tau-c is used to acount for ties which is prefered to tau-b when x and y have a different number of possible values. + /// $/tau_c = 2(n_c - n_d) / (n^2 * (m-1)/m)$ + /// - $n_c$: number of concordant pairs + /// - $n_d$: number of discordant pairs + /// - $n_1$: number of pairs of observations with the same x value + /// - $n_2$: number of pairs of observations with the same y value + /// - $m$: min(distinct x, distinct y) + /// + /// The first sequence of observations. + /// The second sequence of observations. 
+ /// Kendall rank correlation coefficient of setA and setB + /// + /// + /// let x = [1;1;1;2;2;2;3;3;3] + /// let y = [2;2;4;4;6;6;8;8;10] + /// + /// Seq.kendallTauA x y // evaluates to 0.7222222222 + /// Seq.kendallTauB x y // evaluates to 0.8845379627 + /// Seq.kendallTauC x y // evaluates to 0.962962963 + /// + /// + let kendallTauC seq1 seq2 = + let setA = Array.ofSeq seq1 + let setB = Array.ofSeq seq2 + kendallTau Kendall.tauC setA setB + + /// Kendall Correlation Coefficient + /// Computes Kendall rank correlation coefficient between two sequences of observations. Tau-b is used to acount for ties. + /// $/tau_b = (n_c - n_d) / sqrt((n_0 - n_1) * (n_0 - n_2))$ + /// - $n_c$: number of concordant pairs + /// - $n_d$: number of discordant pairs + /// - $n_0$: number of pairs of observations + /// - $n_1$: number of pairs of observations with the same x value + /// - $n_2$: number of pairs of observations with the same y value + /// + /// The first sequence of observations. + /// The second sequence of observations. + /// Kendall rank correlation coefficient of setA and setB + /// + /// + /// let x = [5.05;6.75;3.21;2.66] + /// let y = [1.65;26.5;-0.64;6.95] + /// + /// Seq.kendall x y // evaluates to 0.3333333333 + /// + /// + let kendall seq1 seq2 = kendallTauB seq1 seq2 - kendallCorrFun (FSharp.Stats.Rank.RankFirst() setA ) (FSharp.Stats.Rank.RankFirst() setB ) /// /// Calculates the kendall correlation coefficient of two samples given as a sequence of paired values. 
diff --git a/tests/FSharp.Stats.Tests/Correlation.fs b/tests/FSharp.Stats.Tests/Correlation.fs index 62503527f..926952f8f 100644 --- a/tests/FSharp.Stats.Tests/Correlation.fs +++ b/tests/FSharp.Stats.Tests/Correlation.fs @@ -230,9 +230,17 @@ let inline makeTestList listName caseName corr prop cases = |> testList $"Correlation.Seq.{listName}" [] -let kendallTauBDoubles = TestData.doubles |> makeTestList "kendall" "Double" Seq.kendall (fun x -> x.KendallB) +let kendallTauADoubles = TestData.doubles |> makeTestList "kendallTauA" "Double" Seq.kendallTauA (fun x -> x.KendallA) [] -let kendallTauBInts = TestData.ints |> makeTestList "kendall" "Int" Seq.kendall (fun x -> x.KendallB) +let kendallTauAInts = TestData.ints |> makeTestList "kendallTauA" "Int" Seq.kendallTauA (fun x -> x.KendallA) +[] +let kendallTauBDoubles = TestData.doubles |> makeTestList "kendallTauB" "Double" Seq.kendall (fun x -> x.KendallB) +[] +let kendallTauBInts = TestData.ints |> makeTestList "kendallTauB" "Int" Seq.kendall (fun x -> x.KendallB) +[] +let kendallTauCDoubles = TestData.doubles |> makeTestList "kendallTauC" "Double" Seq.kendallTauC (fun x -> x.KendallC) +[] +let kendallTauCInts = TestData.ints |> makeTestList "kendallTauC" "Int" Seq.kendallTauC (fun x -> x.KendallC) [] let pearsonDoubles = TestData.doubles |> makeTestList "pearson" "Double" Seq.pearson (fun x -> x.Pearson) [] From 2a36fd6056d02ff1f85cca7f4a3a69f56151c3da Mon Sep 17 00:00:00 2001 From: Kevin Malenfant Date: Sat, 27 Apr 2024 15:38:27 -0600 Subject: [PATCH 3/8] remove $ markup --- src/FSharp.Stats/Correlation.fs | 43 ++++++++++++++++----------------- 1 file changed, 21 insertions(+), 22 deletions(-) diff --git a/src/FSharp.Stats/Correlation.fs b/src/FSharp.Stats/Correlation.fs index 1b9a14516..c4ab55663 100644 --- a/src/FSharp.Stats/Correlation.fs +++ b/src/FSharp.Stats/Correlation.fs @@ -423,10 +423,10 @@ module Correlation = /// Kendall Correlation Coefficient /// Computes Kendall rank correlation coefficient between 
two sequences of observations. Tau-a is used to acount for ties. - /// $/tau_a = (n_c - n_d) / n_0$ - /// - $n_c$: number of concordant pairs - /// - $n_d$: number of discordant pairs - /// - $n_0$: number of pairs of observations + /// tau_a = (n_c - n_d) / n_0 where + /// n_c is the number of concordant pairs, + /// n_d is the number of discordant pairs, and + /// n_0 = n*(n-1)/2 where n is the number of observations. /// /// The first sequence of observations. /// The second sequence of observations. @@ -446,12 +446,12 @@ module Correlation = /// Kendall Correlation Coefficient /// Computes Kendall rank correlation coefficient between two sequences of observations. Tau-b is used to acount for ties. - /// $/tau_b = (n_c - n_d) / sqrt((n_0 - n_1) * (n_0 - n_2))$ - /// - $n_c$: number of concordant pairs - /// - $n_d$: number of discordant pairs - /// - $n_0$: number of pairs of observations - /// - $n_1$: number of pairs of observations with the same x value - /// - $n_2$: number of pairs of observations with the same y value + /// tau_b = (n_c - n_d) / sqrt((n_0 - n_1) * (n_0 - n_2)) where + /// n_c is number of concordant pairs, + /// n_d is number of discordant pairs, + /// n_0 = n*(n-1)/2 where n is the number of observations, + /// n_1 = sum_i(t_i(t_i-1)/2) where t_is is t_i he number of pairs of observations with the same x value, + /// n_2 = sum_i(u_i(u_i-1)/2) where u_is is u_i he number of pairs of observations with the same y value. /// /// The first sequence of observations. /// The second sequence of observations. @@ -471,12 +471,11 @@ module Correlation = /// Kendall Correlation Coefficient /// Computes Kendall rank correlation coefficient between two sequences of observations. Tau-c is used to acount for ties which is prefered to tau-b when x and y have a different number of possible values. 
- /// $/tau_c = 2(n_c - n_d) / (n^2 * (m-1)/m)$ - /// - $n_c$: number of concordant pairs - /// - $n_d$: number of discordant pairs - /// - $n_1$: number of pairs of observations with the same x value - /// - $n_2$: number of pairs of observations with the same y value - /// - $m$: min(distinct x, distinct y) + /// tau_c = 2(n_c - n_d) / (n^2 * (m-1)/m) where + /// n_c is number of concordant pairs, + /// n_d is number of discordant pairs, + /// n is the number of observations, + /// m = min(distinct x, distinct y). /// /// The first sequence of observations. /// The second sequence of observations. @@ -498,12 +497,12 @@ module Correlation = /// Kendall Correlation Coefficient /// Computes Kendall rank correlation coefficient between two sequences of observations. Tau-b is used to acount for ties. - /// $/tau_b = (n_c - n_d) / sqrt((n_0 - n_1) * (n_0 - n_2))$ - /// - $n_c$: number of concordant pairs - /// - $n_d$: number of discordant pairs - /// - $n_0$: number of pairs of observations - /// - $n_1$: number of pairs of observations with the same x value - /// - $n_2$: number of pairs of observations with the same y value + /// tau_b = (n_c - n_d) / sqrt((n_0 - n_1) * (n_0 - n_2)) where + /// n_c is number of concordant pairs, + /// n_d is number of discordant pairs, + /// n_0 = n*(n-1)/2 where n is the number of observations, + /// n_1 = sum_i(t_i(t_i-1)/2) where t_is is t_i he number of pairs of observations with the same x value, + /// n_2 = sum_i(u_i(u_i-1)/2) where u_is is u_i he number of pairs of observations with the same y value. /// /// The first sequence of observations. /// The second sequence of observations. 
From 87a2717d99482a923c9d4fc4a851b0ca655a3afb Mon Sep 17 00:00:00 2001 From: Kevin Malenfant Date: Sat, 27 Apr 2024 16:13:55 -0600 Subject: [PATCH 4/8] update docs --- src/FSharp.Stats/Correlation.fs | 24 +++++++++--------------- 1 file changed, 9 insertions(+), 15 deletions(-) diff --git a/src/FSharp.Stats/Correlation.fs b/src/FSharp.Stats/Correlation.fs index c4ab55663..65906fff8 100644 --- a/src/FSharp.Stats/Correlation.fs +++ b/src/FSharp.Stats/Correlation.fs @@ -422,7 +422,7 @@ module Correlation = tau x y pq n0 n1 n2 /// Kendall Correlation Coefficient - /// Computes Kendall rank correlation coefficient between two sequences of observations. Tau-a is used to acount for ties. + /// Computes Kendall Tau-A rank correlation coefficient between two sequences of observations. No adjustment is made for ties. /// tau_a = (n_c - n_d) / n_0 where /// n_c is the number of concordant pairs, /// n_d is the number of discordant pairs, and @@ -445,17 +445,17 @@ module Correlation = kendallTau Kendall.tauA setA setB /// Kendall Correlation Coefficient - /// Computes Kendall rank correlation coefficient between two sequences of observations. Tau-b is used to acount for ties. + /// Computes Kendall Tau-B rank correlation coefficient between two sequences of observations. Tau-b is used to adjust for ties. /// tau_b = (n_c - n_d) / sqrt((n_0 - n_1) * (n_0 - n_2)) where /// n_c is number of concordant pairs, /// n_d is number of discordant pairs, /// n_0 = n*(n-1)/2 where n is the number of observations, - /// n_1 = sum_i(t_i(t_i-1)/2) where t_is is t_i he number of pairs of observations with the same x value, - /// n_2 = sum_i(u_i(u_i-1)/2) where u_is is u_i he number of pairs of observations with the same y value. + /// n_1 = sum_i(t_i(t_i-1)/2) where t_i is the number of pairs of observations with the same x value, + /// n_2 = sum_i(u_i(u_i-1)/2) where u_i is the number of pairs of observations with the same y value. /// /// The first sequence of observations. 
/// The second sequence of observations. - /// Kendall rank correlation coefficient of setA and setB + /// Kendall Tau-B rank correlation coefficient of setA and setB /// /// /// let x = [5.05;6.75;3.21;2.66] @@ -470,7 +470,7 @@ module Correlation = kendallTau Kendall.tauB setA setB /// Kendall Correlation Coefficient - /// Computes Kendall rank correlation coefficient between two sequences of observations. Tau-c is used to acount for ties which is prefered to tau-b when x and y have a different number of possible values. + /// Computes Kendall Tau-C rank correlation coefficient between two sequences of observations. Tau-c is used to adjust for ties which is prefered to tau-b when x and y have a different number of possible values. /// tau_c = 2(n_c - n_d) / (n^2 * (m-1)/m) where /// n_c is number of concordant pairs, /// n_d is number of discordant pairs, @@ -479,7 +479,7 @@ module Correlation = /// /// The first sequence of observations. /// The second sequence of observations. - /// Kendall rank correlation coefficient of setA and setB + /// Kendall Tau-C rank correlation coefficient of setA and setB /// /// /// let x = [1;1;1;2;2;2;3;3;3] @@ -496,17 +496,11 @@ module Correlation = kendallTau Kendall.tauC setA setB /// Kendall Correlation Coefficient - /// Computes Kendall rank correlation coefficient between two sequences of observations. Tau-b is used to acount for ties. - /// tau_b = (n_c - n_d) / sqrt((n_0 - n_1) * (n_0 - n_2)) where - /// n_c is number of concordant pairs, - /// n_d is number of discordant pairs, - /// n_0 = n*(n-1)/2 where n is the number of observations, - /// n_1 = sum_i(t_i(t_i-1)/2) where t_is is t_i he number of pairs of observations with the same x value, - /// n_2 = sum_i(u_i(u_i-1)/2) where u_is is u_i he number of pairs of observations with the same y value. + /// Computes Kendall Tau-B rank correlation coefficient between two sequences of observations. /// /// The first sequence of observations. 
/// The second sequence of observations. - /// Kendall rank correlation coefficient of setA and setB + /// Kendall Tau-B rank correlation coefficient of setA and setB /// /// /// let x = [5.05;6.75;3.21;2.66] From 8c2a8e55054c735b7cc1e30b13729fa8efed160a Mon Sep 17 00:00:00 2001 From: Kevin Malenfant Date: Sun, 28 Apr 2024 04:56:21 -0600 Subject: [PATCH 5/8] doc edits --- src/FSharp.Stats/Correlation.fs | 80 ++++++++++++++++++++------------- 1 file changed, 48 insertions(+), 32 deletions(-) diff --git a/src/FSharp.Stats/Correlation.fs b/src/FSharp.Stats/Correlation.fs index 65906fff8..0fb37d2a8 100644 --- a/src/FSharp.Stats/Correlation.fs +++ b/src/FSharp.Stats/Correlation.fs @@ -302,8 +302,7 @@ module Correlation = /// sum_i(t_i(t_i-1)/2) where t_is is t_i he number of pairs of observations with the same x value. /// sum_i(u_i(u_i-1)/2) where u_is is u_i he number of pairs of observations with the same y value. /// The Kendall tau A statistic. - let tauA _x _y pq n0 _n1 _n2 = - pq / float n0 + let tauA _x _y pq n0 _n1 _n2 = pq / float n0 /// /// Tau B - Adjust for ties. tau_b = pq / sqrt((n0 - n1)(n0 - n2)) /// @@ -342,13 +341,15 @@ module Correlation = /// It is a measure of rank correlation: the similarity of the orderings of the data when ranked by each of the quantities. /// /// - /// The Kendall tau function to use. x: 'a[] -> y: 'b[] -> pq: float -> n0: int -> n1: int -> n2: int -> 'c - /// - x: The first array of observations. - /// - y: The second array of observations. - /// - pq: Number of concordant minues the number of discordant pairs. - /// - n0: he number of pairs of observations. - /// - n1: sum_i(t_i(t_i-1)/2) where t_is is t_i he number of pairs of observations with the same x value. - /// - n2: sum_i(u_i(u_i-1)/2) where u_is is u_i he number of pairs of observations with the same y value. + /// The Kendall tau function to use. x: 'a[] -> y: 'b[] -> pq: float -> n0: int -> n1: int -> n2: int -> 'c + /// + /// x: The first array of observations. 
+ /// y: The second array of observations. + /// pq: Number of concordant minues the number of discordant pairs. + /// n0: The number of pairs of observations. + /// n1: sum_i(t_i(t_i-1)/2) where t_is is t_i he number of pairs of observations with the same x value. + /// n2: sum_i(u_i(u_i-1)/2) where u_is is u_i he number of pairs of observations with the same y value. + /// /// The function would generally return the Kendall tau statistic, however, this setup to allow for returning other values, p-values, multiple statistics, etc. /// /// The first sequence of observations. @@ -422,15 +423,17 @@ module Correlation = tau x y pq n0 n1 n2 /// Kendall Correlation Coefficient - /// Computes Kendall Tau-A rank correlation coefficient between two sequences of observations. No adjustment is made for ties. - /// tau_a = (n_c - n_d) / n_0 where - /// n_c is the number of concordant pairs, - /// n_d is the number of discordant pairs, and - /// n_0 = n*(n-1)/2 where n is the number of observations. + /// Computes Kendall Tau-a rank correlation coefficient between two sequences of observations. No adjustment is made for ties. + /// tau_a = (n_c - n_d) / n_0, where + /// + /// n_c: Number of concordant pairs. + /// n_d: Number of discordant pairs. + /// n_0: n*(n-1)/2 where n is the number of observations. + /// /// /// The first sequence of observations. /// The second sequence of observations. - /// Kendall rank correlation coefficient of setA and setB + /// Kendall Tau-a rank correlation coefficient of setA and setB /// /// /// let x = [5.05;6.75;3.21;2.66] @@ -445,17 +448,19 @@ module Correlation = kendallTau Kendall.tauA setA setB /// Kendall Correlation Coefficient - /// Computes Kendall Tau-B rank correlation coefficient between two sequences of observations. Tau-b is used to adjust for ties. 
- /// tau_b = (n_c - n_d) / sqrt((n_0 - n_1) * (n_0 - n_2)) where - /// n_c is number of concordant pairs, - /// n_d is number of discordant pairs, - /// n_0 = n*(n-1)/2 where n is the number of observations, - /// n_1 = sum_i(t_i(t_i-1)/2) where t_i is the number of pairs of observations with the same x value, - /// n_2 = sum_i(u_i(u_i-1)/2) where u_i is the number of pairs of observations with the same y value. + /// Computes Kendall Tau-b rank correlation coefficient between two sequences of observations. Tau-b is used to adjust for ties. + /// tau_b = (n_c - n_d) / sqrt((n_0 - n_1) * (n_0 - n_2)), where + /// + /// n_c: Number of concordant pairs. + /// n_d: Number of discordant pairs. + /// n_0: n*(n-1)/2 where n is the number of observations. + /// n_1: sum_i(t_i(t_i-1)/2) where t_i is the number of pairs of observations with the same x value. + /// n_2: sum_i(u_i(u_i-1)/2) where u_i is the number of pairs of observations with the same y value. + /// /// /// The first sequence of observations. /// The second sequence of observations. - /// Kendall Tau-B rank correlation coefficient of setA and setB + /// Kendall Tau-b rank correlation coefficient of seq1 and seq2 /// /// /// let x = [5.05;6.75;3.21;2.66] @@ -470,16 +475,18 @@ module Correlation = kendallTau Kendall.tauB setA setB /// Kendall Correlation Coefficient - /// Computes Kendall Tau-C rank correlation coefficient between two sequences of observations. Tau-c is used to adjust for ties which is prefered to tau-b when x and y have a different number of possible values. - /// tau_c = 2(n_c - n_d) / (n^2 * (m-1)/m) where - /// n_c is number of concordant pairs, - /// n_d is number of discordant pairs, - /// n is the number of observations, - /// m = min(distinct x, distinct y). + /// Computes Kendall Tau-c rank correlation coefficient between two sequences of observations. Tau-c is used to adjust for ties which is preferred to Tau-b when x and y have a different number of possible values. 
+ /// tau_c = 2(n_c - n_d) / (n^2 * (m-1)/m), where + /// + /// n_c: Number of concordant pairs. + /// n_d: Number of discordant pairs. + /// n: The number of observations. + /// m: The lesser of the distinct x count and distinct y count. + /// /// /// The first sequence of observations. /// The second sequence of observations. - /// Kendall Tau-C rank correlation coefficient of setA and setB + /// Kendall Tau-c rank correlation coefficient of seq1 and seq2 /// /// /// let x = [1;1;1;2;2;2;3;3;3] @@ -495,12 +502,21 @@ module Correlation = let setB = Array.ofSeq seq2 kendallTau Kendall.tauC setA setB + /// Kendall Correlation Coefficient - /// Computes Kendall Tau-B rank correlation coefficient between two sequences of observations. + /// This is an alias to kendallTauB. Computes Kendall Tau-b rank correlation coefficient between two sequences of observations. Tau-b is used to adjust for ties. + /// tau_b = (n_c - n_d) / sqrt((n_0 - n_1) * (n_0 - n_2)), where + /// + /// n_c: Number of concordant pairs. + /// n_d: Number of discordant pairs. + /// n_0: n*(n-1)/2 where n is the number of observations. + /// n_1: sum_i(t_i(t_i-1)/2) where t_i is the number of pairs of observations with the same x value. + /// n_2: sum_i(u_i(u_i-1)/2) where u_i is the number of pairs of observations with the same y value. + /// /// /// The first sequence of observations. /// The second sequence of observations.
- /// Kendall Tau-B rank correlation coefficient of setA and setB + /// Kendall Tau-b rank correlation coefficient of seq1 and seq2 /// /// /// let x = [5.05;6.75;3.21;2.66] From bd891854cfddd534c862518c85a5b250477b080f Mon Sep 17 00:00:00 2001 From: Kevin Malenfant Date: Sun, 28 Apr 2024 05:05:33 -0600 Subject: [PATCH 6/8] doc edits --- src/FSharp.Stats/Correlation.fs | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/FSharp.Stats/Correlation.fs b/src/FSharp.Stats/Correlation.fs index 0fb37d2a8..f4ba01482 100644 --- a/src/FSharp.Stats/Correlation.fs +++ b/src/FSharp.Stats/Correlation.fs @@ -361,9 +361,6 @@ module Correlation = // - Count the number of concordant and discordant pairs. // - Count the number of ties in x, y, and both. // - Calculate the tau statistic. - // - tau a: no adjustment for ties. - // - tau b: adjustment for ties in x. - // - tau c: adjustment for ties in x and y. if x.Length <> y.Length then invalidArg "y" "The input arrays must have the same length" elif x.Length = 0 then From dc3a0515fe63d947cedc906684e6f33f3c02fb13 Mon Sep 17 00:00:00 2001 From: Kevin Malenfant Date: Sun, 28 Apr 2024 05:16:36 -0600 Subject: [PATCH 7/8] move length check --- src/FSharp.Stats/Correlation.fs | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/src/FSharp.Stats/Correlation.fs b/src/FSharp.Stats/Correlation.fs index f4ba01482..7f425b2d5 100644 --- a/src/FSharp.Stats/Correlation.fs +++ b/src/FSharp.Stats/Correlation.fs @@ -288,7 +288,7 @@ module Correlation = // - x: The first array of observations. // - y: The second array of observations. // - pq: Number of concordant minues the number of discordant pairs. - // - n0: he number of pairs of observations. + // - n0: n(n-1)/2 or (n choose 2), where n is the number of observations. // - n1: sum_i(t_i(t_i-1)/2) where t_is is t_i he number of pairs of observations with the same x value. 
// - n2: sum_i(u_i(u_i-1)/2) where u_is is u_i he number of pairs of observations with the same y value. @@ -298,7 +298,7 @@ module Correlation = /// The first array of observations. /// The second array of observations. /// Number of concordant minues the number of discordant pairs. - /// The number of pairs of observations. + /// n(n-1)/2 or (n choose 2), where n is the number of observations. /// sum_i(t_i(t_i-1)/2) where t_is is t_i he number of pairs of observations with the same x value. /// sum_i(u_i(u_i-1)/2) where u_is is u_i he number of pairs of observations with the same y value. /// The Kendall tau A statistic. @@ -309,7 +309,7 @@ module Correlation = /// The first array of observations. /// The second array of observations. /// Number of concordant minues the number of discordant pairs. - /// The number of pairs of observations. + /// n(n-1)/2 or (n choose 2), where n is the number of observations. /// sum_i(t_i(t_i-1)/2) where t_is is t_i he number of pairs of observations with the same x value. /// sum_i(u_i(u_i-1)/2) where u_is is u_i he number of pairs of observations with the same y value. /// The Kendall tau B statistic. @@ -322,7 +322,7 @@ module Correlation = /// The first array of observations. /// The second array of observations. /// Number of concordant minues the number of discordant pairs. - /// The number of pairs of observations. + /// n(n-1)/2 or (n choose 2), where n is the number of observations. /// sum_i(t_i(t_i-1)/2) where t_is is t_i he number of pairs of observations with the same x value. /// sum_i(u_i(u_i-1)/2) where u_is is u_i he number of pairs of observations with the same y value. /// The Kendall tau C statistic. @@ -346,7 +346,7 @@ module Correlation = /// x: The first array of observations. /// y: The second array of observations. /// pq: Number of concordant minues the number of discordant pairs. - /// n0: The number of pairs of observations. + /// n0: n(n-1)/2 or (n choose 2), where n is the number of observations. 
/// n1: sum_i(t_i(t_i-1)/2) where t_is is t_i he number of pairs of observations with the same x value. /// n2: sum_i(u_i(u_i-1)/2) where u_is is u_i he number of pairs of observations with the same y value. /// @@ -361,12 +361,10 @@ module Correlation = // - Count the number of concordant and discordant pairs. // - Count the number of ties in x, y, and both. // - Calculate the tau statistic. - if x.Length <> y.Length then - invalidArg "y" "The input arrays must have the same length" - elif x.Length = 0 then + let n = min x.Length y.Length + if n = 0 then tau x y nan 0 0 0 else - let n = x.Length let a = [| 0 .. n - 1 |] let sortedIdx = a |> Array.sortBy (fun a -> x.[a], y.[a]) let rec mergesort offset length = @@ -442,6 +440,7 @@ module Correlation = let kendallTauA seq1 seq2 = let setA = Array.ofSeq seq1 let setB = Array.ofSeq seq2 + if setB.Length <> setA.Length then invalidArg "seq2" "The input arrays must have the same length" kendallTau Kendall.tauA setA setB /// Kendall Correlation Coefficient @@ -469,6 +468,7 @@ module Correlation = let kendallTauB seq1 seq2 = let setA = Array.ofSeq seq1 let setB = Array.ofSeq seq2 + if setB.Length <> setA.Length then invalidArg "seq2" "The input arrays must have the same length" kendallTau Kendall.tauB setA setB /// Kendall Correlation Coefficient @@ -497,6 +497,7 @@ module Correlation = let kendallTauC seq1 seq2 = let setA = Array.ofSeq seq1 let setB = Array.ofSeq seq2 + if setB.Length <> setA.Length then invalidArg "seq2" "The input arrays must have the same length" kendallTau Kendall.tauC setA setB From f07944eef4da52ff1f83ac3c0c026caeb02b000e Mon Sep 17 00:00:00 2001 From: Kevin Malenfant Date: Mon, 6 May 2024 10:38:28 -0600 Subject: [PATCH 8/8] update docs --- docs/Correlation.fsx | 73 ++++++++++++++++++++++++++++++++- src/FSharp.Stats/Correlation.fs | 6 +-- 2 files changed, 75 insertions(+), 4 deletions(-) diff --git a/docs/Correlation.fsx b/docs/Correlation.fsx index bbc294535..3b22549dd 100644 --- 
a/docs/Correlation.fsx +++ b/docs/Correlation.fsx @@ -32,7 +32,7 @@ Plotly.NET.Defaults.DefaultDisplayOptions <- [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/fslaborg/FSharp.Stats/gh-pages?urlpath=/tree/home/jovyan/Correlation.ipynb) [![Notebook]({{root}}img/badge-notebook.svg)]({{root}}{{fsdocs-source-basename}}.ipynb) -_Summary_: This tutorial demonstrates how to autocorrelate a signal in FSharp.Stats +_Summary_: This tutorial demonstrates how to calculate correlation coefficients in FSharp.Stats ### Table of contents @@ -77,6 +77,77 @@ table table |> GenericChart.toChartHTML (***include-it-raw***) +(** + +The [Kendall correlation coefficient](https://en.wikipedia.org/wiki/Kendall_rank_correlation_coefficient) calculated by `Seq.kendall` is the Kendall Tau-b coefficient. Three variants are available: + +- `Seq.kendallTauA`: Kendall's Tau-a. Defined as: + + $$\tau_a = \frac{n_c - n_d}{n(n-1)/2}$$ + + where $n_c$ is the number of concordant pairs, $n_d$ is the number of discordant pairs, and $n$ is the sample size. Tau-a does not make adjustments for ties. + +- `Seq.kendallTauB`: Kendall's Tau-b (this is the default used by `Seq.kendall`). Defined as: + + $$\tau_b = \frac{n_c - n_d}{\sqrt{(n_0 - n_1)(n_0 - n_2)}}$$ + + where $n_0 = n(n-1)/2$, $n_1 = \sum_i t_i(t_i-1)/2$, and $n_2 = \sum_j u_j(u_j-1)/2$. Here $t_i$ is the number of tied values in the $i$th group of ties for the first quantity and $u_j$ is the number of tied values in the $j$th group of ties for the second quantity. Tau-b makes adjustments for ties. + +- `Seq.kendallTauC`: Kendall's Tau-c. Defined as: + + $$\tau_c = \frac{2(n_c - n_d)}{n^2(m-1)/m}$$ + + where $m = \min(r,s)$ and $r$ and $s$ are the number of distinct items in each sequence. Tau-c makes an adjustment for set size in addition to ties. + +Here's an example illustrating the differences: + +*) + +// Sequences with no ties +let seqA = [1. .. 
10.0] +let seqB = seqA |> List.map sin + +let noTiesTauA = Seq.kendallTauA seqA seqB +let noTiesTauB = Seq.kendallTauB seqA seqB +let noTiesTauC = Seq.kendallTauC seqA seqB + +// Sequences with ties +let seqC = [1.;2.;2.;3.;4.] +let seqD = [1.;1.;1.;4.;4.] + +let tiesTauA = Seq.kendallTauA seqC seqD +let tiesTauB = Seq.kendallTauB seqC seqD +let tiesTauC = Seq.kendallTauC seqC seqD + +let tableKendall = + let header = ["Correlation measure";"value"] + let rows = + [ + ["Tau-a (no ties)"; sprintf "%3f" noTiesTauA] + ["Tau-b (no ties)"; sprintf "%3f" noTiesTauB] + ["Tau-c (no ties)"; sprintf "%3f" noTiesTauC] + ["Tau-a (ties)"; sprintf "%3f" tiesTauA] + ["Tau-b (ties)"; sprintf "%3f" tiesTauB] + ["Tau-c (ties)"; sprintf "%3f" tiesTauC] + ] + Chart.Table(header, rows, HeaderFillColor = Color.fromHex "#deebf7", CellsFillColor= Color.fromString "lightgrey") + +(*** condition: ipynb ***) +#if IPYNB +tableKendall +#endif // IPYNB + +(***hide***) +tableKendall |> GenericChart.toChartHTML +(***include-it-raw***) + +(** + +As seen, when there are no ties, all three variants give the same result. But with ties present, Tau-b and Tau-c make adjustments and can give different values from Tau-a. `Seq.kendall` uses Tau-b as it is the most commonly used variant. 
+ +*) + + (** ## Matrix correlations diff --git a/src/FSharp.Stats/Correlation.fs b/src/FSharp.Stats/Correlation.fs index 7f425b2d5..40c122c5a 100644 --- a/src/FSharp.Stats/Correlation.fs +++ b/src/FSharp.Stats/Correlation.fs @@ -440,7 +440,7 @@ module Correlation = let kendallTauA seq1 seq2 = let setA = Array.ofSeq seq1 let setB = Array.ofSeq seq2 - if setB.Length <> setA.Length then invalidArg "seq2" "The input arrays must have the same length" + if setB.Length <> setA.Length then invalidArg "seq2" "The input sequences must have the same length" kendallTau Kendall.tauA setA setB /// Kendall Correlation Coefficient @@ -468,7 +468,7 @@ module Correlation = let kendallTauB seq1 seq2 = let setA = Array.ofSeq seq1 let setB = Array.ofSeq seq2 - if setB.Length <> setA.Length then invalidArg "seq2" "The input arrays must have the same length" + if setB.Length <> setA.Length then invalidArg "seq2" "The input sequences must have the same length" kendallTau Kendall.tauB setA setB /// Kendall Correlation Coefficient @@ -497,7 +497,7 @@ module Correlation = let kendallTauC seq1 seq2 = let setA = Array.ofSeq seq1 let setB = Array.ofSeq seq2 - if setB.Length <> setA.Length then invalidArg "seq2" "The input arrays must have the same length" + if setB.Length <> setA.Length then invalidArg "seq2" "The input sequences must have the same length" kendallTau Kendall.tauC setA setB