Skip to content

Commit

Permalink
Add cumulative option for the new statistics.kde() function. (python#…
Browse files Browse the repository at this point in the history
  • Loading branch information
rhettinger authored and adorilson committed Mar 25, 2024
1 parent b8c7c6a commit efb1c33
Show file tree
Hide file tree
Showing 3 changed files with 75 additions and 21 deletions.
13 changes: 8 additions & 5 deletions Doc/library/statistics.rst
Original file line number Diff line number Diff line change
Expand Up @@ -261,11 +261,12 @@ However, for reading convenience, most of the examples show sorted sequences.
Added support for *weights*.


.. function:: kde(data, h, kernel='normal')
.. function:: kde(data, h, kernel='normal', *, cumulative=False)

`Kernel Density Estimation (KDE)
<https://www.itm-conferences.org/articles/itmconf/pdf/2018/08/itmconf_sam2018_00037.pdf>`_:
Create a continuous probability density function from discrete samples.
Create a continuous probability density function or cumulative
distribution function from discrete samples.

The basic idea is to smooth the data using `a kernel function
<https://en.wikipedia.org/wiki/Kernel_(statistics)>`_.
Expand All @@ -280,11 +281,13 @@ However, for reading convenience, most of the examples show sorted sequences.
as much as the more influential bandwidth smoothing parameter.

Kernels that give some weight to every sample point include
*normal* or *gauss*, *logistic*, and *sigmoid*.
*normal* (*gauss*), *logistic*, and *sigmoid*.

Kernels that only give weight to sample points within the bandwidth
include *rectangular* or *uniform*, *triangular*, *parabolic* or
*epanechnikov*, *quartic* or *biweight*, *triweight*, and *cosine*.
include *rectangular* (*uniform*), *triangular*, *parabolic*
(*epanechnikov*), *quartic* (*biweight*), *triweight*, and *cosine*.

If *cumulative* is true, will return a cumulative distribution function.

A :exc:`StatisticsError` will be raised if the *data* sequence is empty.

Expand Down
67 changes: 52 additions & 15 deletions Lib/statistics.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@
from itertools import count, groupby, repeat
from bisect import bisect_left, bisect_right
from math import hypot, sqrt, fabs, exp, erf, tau, log, fsum, sumprod
from math import isfinite, isinf, pi, cos, cosh
from math import isfinite, isinf, pi, cos, sin, cosh, atan
from functools import reduce
from operator import itemgetter
from collections import Counter, namedtuple, defaultdict
Expand Down Expand Up @@ -803,9 +803,9 @@ def multimode(data):
return [value for value, count in counts.items() if count == maxcount]


def kde(data, h, kernel='normal'):
"""Kernel Density Estimation: Create a continuous probability
density function from discrete samples.
def kde(data, h, kernel='normal', *, cumulative=False):
"""Kernel Density Estimation: Create a continuous probability density
function or cumulative distribution function from discrete samples.
The basic idea is to smooth the data using a kernel function
to help draw inferences about a population from a sample.
Expand All @@ -820,20 +820,22 @@ def kde(data, h, kernel='normal'):
Kernels that give some weight to every sample point:
normal or gauss
normal (gauss)
logistic
sigmoid
Kernels that only give weight to sample points within
the bandwidth:
rectangular or uniform
rectangular (uniform)
triangular
parabolic or epanechnikov
quartic or biweight
parabolic (epanechnikov)
quartic (biweight)
triweight
cosine
If *cumulative* is true, will return a cumulative distribution function.
A StatisticsError will be raised if the data sequence is empty.
Example
Expand All @@ -847,7 +849,8 @@ def kde(data, h, kernel='normal'):
Compute the area under the curve:
>>> sum(f_hat(x) for x in range(-20, 20))
>>> area = sum(f_hat(x) for x in range(-20, 20))
>>> round(area, 4)
1.0
Plot the estimated probability density function at
Expand Down Expand Up @@ -876,6 +879,13 @@ def kde(data, h, kernel='normal'):
9: 0.009 x
10: 0.002 x
Estimate P(4.5 < X <= 7.5), the probability that a new sample value
will be between 4.5 and 7.5:
>>> cdf = kde(sample, h=1.5, cumulative=True)
>>> round(cdf(7.5) - cdf(4.5), 2)
0.22
References
----------
Expand All @@ -888,6 +898,9 @@ def kde(data, h, kernel='normal'):
Interactive graphical demonstration and exploration:
https://demonstrations.wolfram.com/KernelDensityEstimation/
Kernel estimation of cumulative distribution function of a random variable with bounded support
https://www.econstor.eu/bitstream/10419/207829/1/10.21307_stattrans-2016-037.pdf
"""

n = len(data)
Expand All @@ -903,45 +916,56 @@ def kde(data, h, kernel='normal'):
match kernel:

case 'normal' | 'gauss':
c = 1 / sqrt(2 * pi)
K = lambda t: c * exp(-1/2 * t * t)
sqrt2pi = sqrt(2 * pi)
sqrt2 = sqrt(2)
K = lambda t: exp(-1/2 * t * t) / sqrt2pi
I = lambda t: 1/2 * (1.0 + erf(t / sqrt2))
support = None

case 'logistic':
# 1.0 / (exp(t) + 2.0 + exp(-t))
K = lambda t: 1/2 / (1.0 + cosh(t))
I = lambda t: 1.0 - 1.0 / (exp(t) + 1.0)
support = None

case 'sigmoid':
# (2/pi) / (exp(t) + exp(-t))
c = 1 / pi
K = lambda t: c / cosh(t)
c1 = 1 / pi
c2 = 2 / pi
K = lambda t: c1 / cosh(t)
I = lambda t: c2 * atan(exp(t))
support = None

case 'rectangular' | 'uniform':
K = lambda t: 1/2
I = lambda t: 1/2 * t + 1/2
support = 1.0

case 'triangular':
K = lambda t: 1.0 - abs(t)
I = lambda t: t*t * (1/2 if t < 0.0 else -1/2) + t + 1/2
support = 1.0

case 'parabolic' | 'epanechnikov':
K = lambda t: 3/4 * (1.0 - t * t)
I = lambda t: -1/4 * t**3 + 3/4 * t + 1/2
support = 1.0

case 'quartic' | 'biweight':
K = lambda t: 15/16 * (1.0 - t * t) ** 2
I = lambda t: 3/16 * t**5 - 5/8 * t**3 + 15/16 * t + 1/2
support = 1.0

case 'triweight':
K = lambda t: 35/32 * (1.0 - t * t) ** 3
I = lambda t: 35/32 * (-1/7*t**7 + 3/5*t**5 - t**3 + t) + 1/2
support = 1.0

case 'cosine':
c1 = pi / 4
c2 = pi / 2
K = lambda t: c1 * cos(c2 * t)
I = lambda t: 1/2 * sin(c2 * t) + 1/2
support = 1.0

case _:
Expand All @@ -952,6 +976,9 @@ def kde(data, h, kernel='normal'):
def pdf(x):
return sum(K((x - x_i) / h) for x_i in data) / (n * h)

def cdf(x):
return sum(I((x - x_i) / h) for x_i in data) / n

else:

sample = sorted(data)
Expand All @@ -963,9 +990,19 @@ def pdf(x):
supported = sample[i : j]
return sum(K((x - x_i) / h) for x_i in supported) / (n * h)

pdf.__doc__ = f'PDF estimate with {h=!r} and {kernel=!r}'
def cdf(x):
i = bisect_left(sample, x - bandwidth)
j = bisect_right(sample, x + bandwidth)
supported = sample[i : j]
return sum((I((x - x_i) / h) for x_i in supported), i) / n

return pdf
if cumulative:
cdf.__doc__ = f'CDF estimate with {h=!r} and {kernel=!r}'
return cdf

else:
pdf.__doc__ = f'PDF estimate with {h=!r} and {kernel=!r}'
return pdf


# Notes on methods for computing quantiles
Expand Down
16 changes: 15 additions & 1 deletion Lib/test/test_statistics.py
Original file line number Diff line number Diff line change
Expand Up @@ -2379,6 +2379,18 @@ def integrate(func, low, high, steps=10_000):
area = integrate(f_hat, -20, 20)
self.assertAlmostEqual(area, 1.0, places=4)

# Check CDF against an integral of the PDF

data = [3, 5, 10, 12]
h = 2.3
x = 10.5
for kernel in kernels:
with self.subTest(kernel=kernel):
cdf = kde(data, h, kernel, cumulative=True)
f_hat = kde(data, h, kernel)
area = integrate(f_hat, -20, x, 100_000)
self.assertAlmostEqual(cdf(x), area, places=4)

# Check error cases

with self.assertRaises(StatisticsError):
Expand All @@ -2395,6 +2407,8 @@ def integrate(func, low, high, steps=10_000):
kde(sample, h='str') # Wrong bandwidth type
with self.assertRaises(StatisticsError):
kde(sample, h=1.0, kernel='bogus') # Invalid kernel
with self.assertRaises(TypeError):
kde(sample, 1.0, 'gauss', True) # Positional cumulative argument

# Test name and docstring of the generated function

Expand All @@ -2403,7 +2417,7 @@ def integrate(func, low, high, steps=10_000):
f_hat = kde(sample, h, kernel)
self.assertEqual(f_hat.__name__, 'pdf')
self.assertIn(kernel, f_hat.__doc__)
self.assertIn(str(h), f_hat.__doc__)
self.assertIn(repr(h), f_hat.__doc__)

# Test closed interval for the support boundaries.
# In particular, 'uniform' should non-zero at the boundaries.
Expand Down

0 comments on commit efb1c33

Please sign in to comment.