Skip to content

Add kernel density estimation to the statistics module #115532

Closed
@rhettinger

Description

@rhettinger

Feature

Proposal:

I propose promoting the KDE statistics recipe to be an actual part of the statistics module.
See: https://docs.python.org/3.13/library/statistics.html#kernel-density-estimation

My thought Is that it is an essential part of statistical reasoning about sample data
See: https://www.itm-conferences.org/articles/itmconf/pdf/2018/08/itmconf_sam2018_00037.pdf

Discussion of this feature:

This was discussed and reviewed with the module author who gave it his full support:

This looks really great, both as a concept and the code itself. It's a
good showcase for match, and an important statistics function.

Thank you for working on this.

Working Draft:

from typing import Sequence, Callable
from bisect import bisect_left, bisect_right
from math import sqrt, exp, cos, pi, cosh

def kde(sample: Sequence[float],
        h: float,
        kernel_function: str='gauss',
    ) -> Callable[[float], float]:
    """Kernel Density Estimation.  Creates a continuous probability
    density function from discrete samples.

    The basic idea is to smooth the data using a kernel function
    to help draw inferences about a population from a sample.

    The degree of smoothing is controlled by the scaling parameter h
    which is called the bandwidth.  Smaller values emphasize local
    features while larger values give smoother results.

    Kernel functions that give some weight to every sample point:

       gauss or normal
       logistic
       sigmoid

    Kernel functions that only give weight to sample points within
    the bandwidth:

       rectangular or uniform
       triangular
       epanechnikov or parabolic
       biweight or quartic
       triweight
       cosine

    Example
    -------

    Given a sample of six data points, estimate the
    probablity density at evenly spaced points from -6 to 10:

        >>> sample = [-2.1, -1.3, -0.4, 1.9, 5.1, 6.2]
        >>> f_hat = kde(sample, h=1.5)
        >>> total = 0.0
        >>> for x in range(-6, 11):
        ...     density = f_hat(x)
        ...     total += density
        ...     plot = ' ' * int(density * 400) + 'x'
        ...     print(f'{x:2}: {density:.3f} {plot}')
        ...
        -6: 0.002 x
        -5: 0.009    x
        -4: 0.031             x
        -3: 0.070                             x
        -2: 0.111                                             x
        -1: 0.125                                                   x
         0: 0.110                                            x
         1: 0.086                                   x
         2: 0.068                            x
         3: 0.059                        x
         4: 0.066                           x
         5: 0.082                                 x
         6: 0.082                                 x
         7: 0.058                        x
         8: 0.028            x
         9: 0.009    x
        10: 0.002 x
        >>> round(total, 3)
        0.999

    References
    ----------

    Kernel density estimation and its application:
    https://www.itm-conferences.org/articles/itmconf/pdf/2018/08/itmconf_sam2018_00037.pdf

    Kernel functions in common use:
    https://en.wikipedia.org/wiki/Kernel_(statistics)#Kernel_functions_in_common_use

    Interactive graphical demonstration and exploration:
    https://demonstrations.wolfram.com/KernelDensityEstimation/

    """

    kernel: Callable[[float], float]
    support: float | None

    match kernel_function:

        case 'gauss' | 'normal':
            c = 1 / sqrt(2 * pi)
            kernel = lambda t: c * exp(-1/2 * t * t)
            support = None

        case 'logistic':
            # 1.0 / (exp(t) + 2.0 + exp(-t))
            kernel = lambda t: 1/2 / (1.0 + cosh(t))
            support = None

        case 'sigmoid':
            # (2/pi) / (exp(t) + exp(-t))
            c = 1 / pi
            kernel = lambda t: c / cosh(t)
            support = None

        case 'rectangular' | 'uniform':
            kernel = lambda t: 1/2
            support = 1.0

        case 'triangular':
            kernel = lambda t: 1.0 - abs(t)
            support = 1.0

        case 'epanechnikov' | 'parabolic':
            kernel = lambda t: 3/4 * (1.0 - t * t)
            support = 1.0

        case 'biweight' | 'quartic':
            kernel = lambda t: 15/16 * (1.0 - t * t) ** 2
            support = 1.0

        case 'triweight':
            kernel = lambda t: 35/32 * (1.0 - t * t) ** 3
            support = 1.0

        case 'cosine':
            c1 = pi / 4
            c2 = pi / 2
            kernel = lambda t: c1 * cos(c2 * t)
            support = 1.0

        case _:
            raise ValueError(f'Unknown kernel function: {kernel_function!r}')

    n = len(sample)

    if support is None:

        def pdf(x: float) -> float:
            return sum(kernel((x - x_i) / h) for x_i in sample) / (n * h)

    else:

        sample = sorted(sample)
        bandwidth = h * support

        def pdf(x: float) -> float:
            i = bisect_left(sample, x - bandwidth)
            j = bisect_right(sample, x + bandwidth)
            supported = sample[i : j]
            return sum(kernel((x - x_i) / h) for x_i in supported) / (n * h)

    return pdf


def _test() -> None:
    from statistics import NormalDist

    def kde_normal(sample: Sequence[float], h: float) -> Callable[[float], float]:
        "Create a continuous probability density function from a sample."
        # Smooth the sample with a normal distribution kernel scaled by h.
        kernel_h = NormalDist(0.0, h).pdf
        n = len(sample)
        def pdf(x: float) -> float:
            return sum(kernel_h(x - x_i) for x_i in sample) / n
        return pdf

    D = NormalDist(250, 50)
    data = D.samples(100)
    h = 30
    pd = D.pdf

    fg = kde(data, h, 'gauss')
    fg2 = kde_normal(data, h)
    fr = kde(data, h, 'rectangular')
    ft = kde(data, h, 'triangular')
    fb = kde(data, h, 'biweight')
    fe = kde(data, h, 'epanechnikov')
    fc = kde(data, h, 'cosine')
    fl = kde(data, h, 'logistic')
    fs = kde(data, h, 'sigmoid')

    def show(x: float) -> None:
        for func in (pd, fg, fg2, fr, ft, fb, fe, fc, fl, fs):
            print(func(x))
        print()

    show(197)
    show(255)
    show(320)

    def integrate(func: Callable[[float], float],
                  low: float, high: float, steps: int=1000) -> float:
        width = (high - low) / steps
        xarr = (low + width * i for i in range(steps))
        return sum(map(func, xarr)) * width

    print('\nIntegrals:')
    for func in (pd, fg, fg2, fr, ft, fb, fe, fc, fl, fs):
        print(integrate(func, 0, 500, steps=10_000))


if __name__ == '__main__':


    import doctest

    _test()
    print(doctest.testmod())

Linked PRs

Metadata

Metadata

Assignees

No one assigned

    Labels

    3.13bugs and security fixestype-featureA feature request or enhancement

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions