Proposal:
I propose promoting the KDE statistics recipe to be an actual part of the statistics module.
See: https://docs.python.org/3.13/library/statistics.html#kernel-density-estimation
My thought is that it is an essential part of statistical reasoning about sample data.
See: https://www.itm-conferences.org/articles/itmconf/pdf/2018/08/itmconf_sam2018_00037.pdf
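
For anyone who has not used the recipe, the promoted API would presumably be called like this (a sketch only; it assumes the promoted function keeps the recipe's name and signature):

    from statistics import kde
    f_hat = kde([-2.1, -1.3, -0.4, 1.9, 5.1, 6.2], h=1.5)
    print(f_hat(0.0))  # estimated probability density at x = 0.0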
Discussion of this feature:
This was discussed and reviewed with the module author, who gave it his full support:

"This looks really great, both as a concept and the code itself. It's a good showcase for match, and an important statistics function. Thank you for working on this."
Working Draft:
from collections.abc import Callable, Sequence
from bisect import bisect_left, bisect_right
from math import sqrt, exp, cos, pi, cosh


def kde(sample: Sequence[float],
        h: float,
        kernel_function: str = 'gauss',
        ) -> Callable[[float], float]:
"""Kernel Density Estimation. Creates a continuous probability
density function from discrete samples.
The basic idea is to smooth the data using a kernel function
to help draw inferences about a population from a sample.
The degree of smoothing is controlled by the scaling parameter h
which is called the bandwidth. Smaller values emphasize local
features while larger values give smoother results.
Kernel functions that give some weight to every sample point:
gauss or normal
logistic
sigmoid
Kernel functions that only give weight to sample points within
the bandwidth:
rectangular or uniform
triangular
epanechnikov or parabolic
biweight or quartic
triweight
cosine
Example
-------
Given a sample of six data points, estimate the
probablity density at evenly spaced points from -6 to 10:
>>> sample = [-2.1, -1.3, -0.4, 1.9, 5.1, 6.2]
>>> f_hat = kde(sample, h=1.5)
>>> total = 0.0
>>> for x in range(-6, 11):
... density = f_hat(x)
... total += density
... plot = ' ' * int(density * 400) + 'x'
... print(f'{x:2}: {density:.3f} {plot}')
...
    -6: 0.002 x
    -5: 0.009    x
    -4: 0.031             x
    -3: 0.070                             x
    -2: 0.111                                             x
    -1: 0.125                                                   x
     0: 0.110                                            x
     1: 0.086                                   x
     2: 0.068                            x
     3: 0.059                        x
     4: 0.066                           x
     5: 0.082                                 x
     6: 0.082                                 x
     7: 0.058                        x
     8: 0.028            x
     9: 0.009    x
    10: 0.002 x
    >>> round(total, 3)
    0.999

    References
    ----------

    Kernel density estimation and its application:
    https://www.itm-conferences.org/articles/itmconf/pdf/2018/08/itmconf_sam2018_00037.pdf

    Kernel functions in common use:
    https://en.wikipedia.org/wiki/Kernel_(statistics)#Kernel_functions_in_common_use

    Interactive graphical demonstration and exploration:
    https://demonstrations.wolfram.com/KernelDensityEstimation/

    """
    kernel: Callable[[float], float]
    support: float | None

    match kernel_function:

        case 'gauss' | 'normal':
            c = 1 / sqrt(2 * pi)
            kernel = lambda t: c * exp(-1/2 * t * t)
            support = None

        case 'logistic':
            # 1.0 / (exp(t) + 2.0 + exp(-t))
            kernel = lambda t: 1/2 / (1.0 + cosh(t))
            support = None

        case 'sigmoid':
            # (2/pi) / (exp(t) + exp(-t))
            c = 1 / pi
            kernel = lambda t: c / cosh(t)
            support = None

        case 'rectangular' | 'uniform':
            kernel = lambda t: 1/2
            support = 1.0

        case 'triangular':
            kernel = lambda t: 1.0 - abs(t)
            support = 1.0

        case 'epanechnikov' | 'parabolic':
            kernel = lambda t: 3/4 * (1.0 - t * t)
            support = 1.0

        case 'biweight' | 'quartic':
            kernel = lambda t: 15/16 * (1.0 - t * t) ** 2
            support = 1.0

        case 'triweight':
            kernel = lambda t: 35/32 * (1.0 - t * t) ** 3
            support = 1.0

        case 'cosine':
            c1 = pi / 4
            c2 = pi / 2
            kernel = lambda t: c1 * cos(c2 * t)
            support = 1.0

        case _:
            raise ValueError(f'Unknown kernel function: {kernel_function!r}')
    n = len(sample)

    if support is None:

        def pdf(x: float) -> float:
            return sum(kernel((x - x_i) / h) for x_i in sample) / (n * h)

    else:

        sample = sorted(sample)
        bandwidth = h * support

        def pdf(x: float) -> float:
            # Only points within the kernel's support contribute, so use
            # bisection on the sorted sample to limit the summation window.
            i = bisect_left(sample, x - bandwidth)
            j = bisect_right(sample, x + bandwidth)
            supported = sample[i : j]
            return sum(kernel((x - x_i) / h) for x_i in supported) / (n * h)

    return pdf
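
# ----------------------------------------------------------------------
# Illustration only, not part of the proposed draft: choosing h is the
# hard part in practice.  Silverman's rule of thumb,
#     h = 0.9 * min(sigma, iqr / 1.34) * n ** (-1/5),
# is a common default starting point; the helper name below is
# hypothetical.
def silverman_bandwidth(sample: Sequence[float]) -> float:
    from statistics import stdev, quantiles
    n = len(sample)
    q1, _, q3 = quantiles(sample, n=4)   # sample quartiles
    return 0.9 * min(stdev(sample), (q3 - q1) / 1.34) * n ** (-1/5)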
def _test() -> None:
    from statistics import NormalDist

    def kde_normal(sample: Sequence[float], h: float) -> Callable[[float], float]:
        "Create a continuous probability density function from a sample."
        # Smooth the sample with a normal distribution kernel scaled by h.
        kernel_h = NormalDist(0.0, h).pdf
        n = len(sample)
        def pdf(x: float) -> float:
            return sum(kernel_h(x - x_i) for x_i in sample) / n
        return pdf

    D = NormalDist(250, 50)
    data = D.samples(100)
    h = 30

    # Compare the true density against every kernel's estimate.
    pd = D.pdf
    fg = kde(data, h, 'gauss')
    fg2 = kde_normal(data, h)
    fr = kde(data, h, 'rectangular')
    ft = kde(data, h, 'triangular')
    fb = kde(data, h, 'biweight')
    fe = kde(data, h, 'epanechnikov')
    fc = kde(data, h, 'cosine')
    fl = kde(data, h, 'logistic')
    fs = kde(data, h, 'sigmoid')

    def show(x: float) -> None:
        for func in (pd, fg, fg2, fr, ft, fb, fe, fc, fl, fs):
            print(func(x))
        print()

    show(197)
    show(255)
    show(320)

    def integrate(func: Callable[[float], float],
                  low: float, high: float, steps: int = 1000) -> float:
        "Rectangle-rule approximation of the integral of func over [low, high]."
        width = (high - low) / steps
        xarr = (low + width * i for i in range(steps))
        return sum(map(func, xarr)) * width

    # Each estimated density should integrate to approximately 1.0.
    print('\nIntegrals:')
    for func in (pd, fg, fg2, fr, ft, fb, fe, fc, fl, fs):
        print(integrate(func, 0, 500, steps=10_000))


if __name__ == '__main__':
    import doctest
    _test()
    print(doctest.testmod())
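
To see the bandwidth trade-off described in the docstring, here is a small follow-on snippet (illustrative only; it reuses the six-point sample from the docstring example):

    sample = [-2.1, -1.3, -0.4, 1.9, 5.1, 6.2]
    for h in (0.5, 1.5, 4.0):
        f_hat = kde(sample, h)
        # Small h tracks individual sample points; large h blends them.
        print(h, [round(f_hat(x), 3) for x in range(-2, 3)])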