From efb1c3326da4da1cac2dbbb6f23ddbd541aea765 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Sun, 24 Mar 2024 11:35:58 +0200 Subject: [PATCH] Add cumulative option for the new statistics.kde() function. (#117033) --- Doc/library/statistics.rst | 13 ++++--- Lib/statistics.py | 67 ++++++++++++++++++++++++++++--------- Lib/test/test_statistics.py | 16 ++++++++- 3 files changed, 75 insertions(+), 21 deletions(-) diff --git a/Doc/library/statistics.rst b/Doc/library/statistics.rst index 1785c6bcc212b74..79c681234545247 100644 --- a/Doc/library/statistics.rst +++ b/Doc/library/statistics.rst @@ -261,11 +261,12 @@ However, for reading convenience, most of the examples show sorted sequences. Added support for *weights*. -.. function:: kde(data, h, kernel='normal') +.. function:: kde(data, h, kernel='normal', *, cumulative=False) `Kernel Density Estimation (KDE) `_: - Create a continuous probability density function from discrete samples. + Create a continuous probability density function or cumulative + distribution function from discrete samples. The basic idea is to smooth the data using `a kernel function `_. @@ -280,11 +281,13 @@ However, for reading convenience, most of the examples show sorted sequences. as much as the more influential bandwidth smoothing parameter. Kernels that give some weight to every sample point include - *normal* or *gauss*, *logistic*, and *sigmoid*. + *normal* (*gauss*), *logistic*, and *sigmoid*. Kernels that only give weight to sample points within the bandwidth - include *rectangular* or *uniform*, *triangular*, *parabolic* or - *epanechnikov*, *quartic* or *biweight*, *triweight*, and *cosine*. + include *rectangular* (*uniform*), *triangular*, *parabolic* + (*epanechnikov*), *quartic* (*biweight*), *triweight*, and *cosine*. + + If *cumulative* is true, will return a cumulative distribution function. A :exc:`StatisticsError` will be raised if the *data* sequence is empty. diff --git a/Lib/statistics.py b/Lib/statistics.py index 5d636258fd442b2..58fb31def8896e3 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -138,7 +138,7 @@ from itertools import count, groupby, repeat from bisect import bisect_left, bisect_right from math import hypot, sqrt, fabs, exp, erf, tau, log, fsum, sumprod -from math import isfinite, isinf, pi, cos, cosh +from math import isfinite, isinf, pi, cos, sin, cosh, atan from functools import reduce from operator import itemgetter from collections import Counter, namedtuple, defaultdict @@ -803,9 +803,9 @@ def multimode(data): return [value for value, count in counts.items() if count == maxcount] -def kde(data, h, kernel='normal'): - """Kernel Density Estimation: Create a continuous probability - density function from discrete samples. +def kde(data, h, kernel='normal', *, cumulative=False): + """Kernel Density Estimation: Create a continuous probability density + function or cumulative distribution function from discrete samples. The basic idea is to smooth the data using a kernel function to help draw inferences about a population from a sample. @@ -820,20 +820,22 @@ def kde(data, h, kernel='normal'): Kernels that give some weight to every sample point: - normal or gauss + normal (gauss) logistic sigmoid Kernels that only give weight to sample points within the bandwidth: - rectangular or uniform + rectangular (uniform) triangular - parabolic or epanechnikov - quartic or biweight + parabolic (epanechnikov) + quartic (biweight) triweight cosine + If *cumulative* is true, will return a cumulative distribution function. + A StatisticsError will be raised if the data sequence is empty. Example @@ -847,7 +849,8 @@ def kde(data, h, kernel='normal'): Compute the area under the curve: - >>> sum(f_hat(x) for x in range(-20, 20)) + >>> area = sum(f_hat(x) for x in range(-20, 20)) + >>> round(area, 4) 1.0 Plot the estimated probability density function at @@ -876,6 +879,13 @@ def kde(data, h, kernel='normal'): 9: 0.009 x 10: 0.002 x + Estimate P(4.5 < X <= 7.5), the probability that a new sample value + will be between 4.5 and 7.5: + + >>> cdf = kde(sample, h=1.5, cumulative=True) + >>> round(cdf(7.5) - cdf(4.5), 2) + 0.22 + References ---------- @@ -888,6 +898,9 @@ def kde(data, h, kernel='normal'): Interactive graphical demonstration and exploration: https://demonstrations.wolfram.com/KernelDensityEstimation/ + Kernel estimation of cumulative distribution function of a random variable with bounded support + https://www.econstor.eu/bitstream/10419/207829/1/10.21307_stattrans-2016-037.pdf + """ n = len(data) @@ -903,45 +916,56 @@ def kde(data, h, kernel='normal'): match kernel: case 'normal' | 'gauss': - c = 1 / sqrt(2 * pi) - K = lambda t: c * exp(-1/2 * t * t) + sqrt2pi = sqrt(2 * pi) + sqrt2 = sqrt(2) + K = lambda t: exp(-1/2 * t * t) / sqrt2pi + I = lambda t: 1/2 * (1.0 + erf(t / sqrt2)) support = None case 'logistic': # 1.0 / (exp(t) + 2.0 + exp(-t)) K = lambda t: 1/2 / (1.0 + cosh(t)) + I = lambda t: 1.0 - 1.0 / (exp(t) + 1.0) support = None case 'sigmoid': # (2/pi) / (exp(t) + exp(-t)) - c = 1 / pi - K = lambda t: c / cosh(t) + c1 = 1 / pi + c2 = 2 / pi + K = lambda t: c1 / cosh(t) + I = lambda t: c2 * atan(exp(t)) support = None case 'rectangular' | 'uniform': K = lambda t: 1/2 + I = lambda t: 1/2 * t + 1/2 support = 1.0 case 'triangular': K = lambda t: 1.0 - abs(t) + I = lambda t: t*t * (1/2 if t < 0.0 else -1/2) + t + 1/2 support = 1.0 case 'parabolic' | 'epanechnikov': K = lambda t: 3/4 * (1.0 - t * t) + I = lambda t: -1/4 * t**3 + 3/4 * t + 1/2 support = 1.0 case 'quartic' | 'biweight': K = lambda t: 15/16 * (1.0 - t * t) ** 2 + I = lambda t: 3/16 * t**5 - 5/8 * t**3 + 15/16 * t + 1/2 support = 1.0 case 'triweight': K = lambda t: 35/32 * (1.0 - t * t) ** 3 + I = lambda t: 35/32 * (-1/7*t**7 + 3/5*t**5 - t**3 + t) + 1/2 support = 1.0 case 'cosine': c1 = pi / 4 c2 = pi / 2 K = lambda t: c1 * cos(c2 * t) + I = lambda t: 1/2 * sin(c2 * t) + 1/2 support = 1.0 case _: @@ -952,6 +976,9 @@ def kde(data, h, kernel='normal'): def pdf(x): return sum(K((x - x_i) / h) for x_i in data) / (n * h) + def cdf(x): + return sum(I((x - x_i) / h) for x_i in data) / n + else: sample = sorted(data) @@ -963,9 +990,19 @@ def pdf(x): supported = sample[i : j] return sum(K((x - x_i) / h) for x_i in supported) / (n * h) - pdf.__doc__ = f'PDF estimate with {h=!r} and {kernel=!r}' + def cdf(x): + i = bisect_left(sample, x - bandwidth) + j = bisect_right(sample, x + bandwidth) + supported = sample[i : j] + return sum((I((x - x_i) / h) for x_i in supported), i) / n - return pdf + if cumulative: + cdf.__doc__ = f'CDF estimate with {h=!r} and {kernel=!r}' + return cdf + + else: + pdf.__doc__ = f'PDF estimate with {h=!r} and {kernel=!r}' + return pdf # Notes on methods for computing quantiles diff --git a/Lib/test/test_statistics.py b/Lib/test/test_statistics.py index 1cf41638a7f01a3..204787a88a9c5fb 100644 --- a/Lib/test/test_statistics.py +++ b/Lib/test/test_statistics.py @@ -2379,6 +2379,18 @@ def integrate(func, low, high, steps=10_000): area = integrate(f_hat, -20, 20) self.assertAlmostEqual(area, 1.0, places=4) + # Check CDF against an integral of the PDF + + data = [3, 5, 10, 12] + h = 2.3 + x = 10.5 + for kernel in kernels: + with self.subTest(kernel=kernel): + cdf = kde(data, h, kernel, cumulative=True) + f_hat = kde(data, h, kernel) + area = integrate(f_hat, -20, x, 100_000) + self.assertAlmostEqual(cdf(x), area, places=4) + # Check error cases with self.assertRaises(StatisticsError): @@ -2395,6 +2407,8 @@ def integrate(func, low, high, steps=10_000): kde(sample, h='str') # Wrong bandwidth type with self.assertRaises(StatisticsError): kde(sample, h=1.0, kernel='bogus') # Invalid kernel + with self.assertRaises(TypeError): + kde(sample, 1.0, 'gauss', True) # Positional cumulative argument # Test name and docstring of the generated function @@ -2403,7 +2417,7 @@ def integrate(func, low, high, steps=10_000): f_hat = kde(sample, h, kernel) self.assertEqual(f_hat.__name__, 'pdf') self.assertIn(kernel, f_hat.__doc__) - self.assertIn(str(h), f_hat.__doc__) + self.assertIn(repr(h), f_hat.__doc__) # Test closed interval for the support boundaries. # In particular, 'uniform' should non-zero at the boundaries.