from ..utils import unique_row_count

-from itertools import product as iterproduct
+from numpy import (array, atleast_1d, digitize, empty, floor, linspace, log2,
+                   histogramdd, hstack, ndarray, sqrt, vstack)
+from scipy.stats import skew

-from numpy import (digitize, empty, linspace, histogramdd, hstack, product,
-                   vstack, zeros)
+__all__ = ['hist', 'symbolic', 'doanes_rule']

-from scipy.stats import binom_test

-__all__ = ['hist', 'symbolic', 'adaptive']
+def doanes_rule(x):
+    """Convenience function for choosing an optimal number of bins using Doane's Rule.
+
+    Parameters
+    ----------
+    x : numpy.ndarray or list of floats
+        Data to be binned.
+
+    Returns
+    -------
+    n_bins : int
+    """
+    if not isinstance(x, ndarray):
+        x = array(x)
+
+    n = x.shape[0]
+    g1 = atleast_1d(skew(x))
+    sg1 = sqrt(6 * (n - 2) / ((n + 1) * (n + 3)))
+
+    return min(floor(1 + log2(n) + log2(1 + abs(g1) / sg1)))


def hist(n_bins, rng, *args):
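For context, Doane's Rule extends Sturges' formula with a skewness term, so skewed samples earn extra bins. The snippet below is a standalone sketch of the same arithmetic using only numpy and scipy; the random sample and the printed value are illustrative, not part of this module.

```python
# Standalone sketch of the Doane's Rule arithmetic added above.
# The sample data here is made up for illustration.
from numpy import atleast_1d, floor, log2, sqrt
from numpy.random import RandomState
from scipy.stats import skew

x = RandomState(42).standard_normal(1000)

n = x.shape[0]
g1 = atleast_1d(skew(x))                       # sample skewness
sg1 = sqrt(6 * (n - 2) / ((n + 1) * (n + 3)))  # standard error of the skewness
n_bins = min(floor(1 + log2(n) + log2(1 + abs(g1) / sg1)))
print(n_bins)  # ~11 for 1000 roughly normal samples; note it is a numpy float
```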
@@ -27,6 +46,10 @@ def hist(n_bins, rng, *args):
    bins : array_like, shape = (n_bins, )
    """
    data = vstack((args)).T
+
+    if n_bins is None:
+        n_bins = doanes_rule(data)
+
    return histogramdd(data, bins=n_bins, range=rng)[0].flatten()

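`hist` now falls back to Doane's Rule whenever `n_bins is None`. Below is a minimal sketch of the stack, bin, and flatten pattern it wraps; the data, range, and fixed bin count (standing in for the `doanes_rule(data)` fallback, which returns a numpy float and may need an `int` cast on recent numpy) are made up for illustration.

```python
# Sketch of the stack -> histogramdd -> flatten pattern used by hist();
# the data, range, and bin count below are illustrative only.
from numpy import histogramdd, vstack
from numpy.random import RandomState

rs = RandomState(0)
x, y = rs.rand(500), rs.rand(500)

data = vstack((x, y)).T             # shape (500, 2): one column per series
counts = histogramdd(data, bins=8,  # 8 stands in for doanes_rule(data)
                     range=[(0, 1), (0, 1)])[0].flatten()
print(counts.shape, counts.sum())   # (64,) 500.0 -- flattened joint histogram
```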
@@ -47,88 +70,15 @@ def symbolic(n_bins, rng, *args):
    -------
    counts : float
    """
-
    labels = empty(0).reshape(args[0].shape[0], 0)
-    for i, arg in enumerate(args):
-        if n_bins is not None:
-            partitions = linspace(rng[i][0], rng[i][1], n_bins + 1)
-            label = digitize(arg, partitions).reshape(-1, 1)
-        else:
-            rng = tuple(rng)
-            label = adaptive(arg)
-        labels = hstack((labels, label))
-
-    return unique_row_count(labels)
+    if n_bins is None:
+        n_bins = min(map(doanes_rule, args))

+    for i, arg in enumerate(args):

-def adaptive(*args, rng=None, alpha=0.05):
-    """Darbellay-Vajda adaptive partitioning (doi:10.1109/18.761290)
+        partitions = linspace(rng[i][0], rng[i][1], n_bins + 1)
+        label = digitize(arg, partitions).reshape(-1, 1)

-    Parameters
-    ----------
-    args : array_like, shape = (n_samples, )
-        Data of which to histogram.
-    rng : list of lists
-        List of min/max values to bin data over.
-    alpha : float
-        Chi-squared test criterion.
+        labels = hstack((labels, label))

-    Returns
-    -------
-    bins : array_like, shape = (n_bins, )
-    """
-    data = vstack(args).T
-
-    # Get number of dimensions
-    n_dims = data.shape[1]
-    dims = range(n_dims)
-
-    # If no ranges are supplied, initialize with min/max for each dimension
-    if rng is None:
-        rng = tuple((data[:, i].min(), data[:, i].max()) for i in dims)
-
-    if not (0. <= alpha < 1):
-        raise ValueError('alpha must be a float in [0, 1).')
-
-    def dvpartition(data, rng):
-        nonlocal n_dims
-        nonlocal counts
-        nonlocal labels
-        nonlocal dims
-
-        # Filter out data that is not in our initial partition
-        where = product([(i[0] <= data[:, j]) * (i[1] >= data[:, j])
-                         for j, i in enumerate(rng)], 0).astype(bool)
-        filtered = data[where, :]
-
-        # Subdivide our partitions by the midpoint in each dimension
-        partitions = set([])
-        part = [linspace(rng[i][0], rng[i][1], 3) for i in dims]
-        newrng = set((tuple((part[i][j[i]], part[i][j[i] + 1]) for i in dims)
-                      for j in iterproduct(*(n_dims * [[0, 1]]))),)
-
-        # Calculate counts for new partitions
-        freq = histogramdd(filtered, bins=part)[0]
-
-        # Perform binomial test with a given alpha,
-        # and if not uniform proceed
-        if (binom_test(freq) < alpha / 2. and
-                False not in ((filtered.max(0) - filtered.min(0)).T > 0)):
-
-            # For each new partition continue algorithm recursively
-            for nr in newrng:
-                newpart = dvpartition(data, rng=nr)
-                for newp in newpart:
-                    partitions.update(tuple((newp,)))
-
-        # Else if uniform and contains data, return current partition
-        elif filtered.shape[0] > 0:
-            partitions = set(tuple((rng,)))
-            labels[where] = len(counts)
-            counts += (filtered.shape[0],)
-        return partitions
-
-    counts = ()
-    labels = zeros(data.shape[0], dtype=int)
-    dvpartition(data, rng)
-    return labels.reshape(-1, n_dims)
+    return unique_row_count(labels)
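With the adaptive partitioner gone, `symbolic` now digitizes each series on its own range (using the smallest Doane's-Rule estimate across the arguments when `n_bins is None`) and counts unique label rows. A rough standalone equivalent, with `numpy.unique` standing in for the package's `unique_row_count` helper and made-up data:

```python
# Rough equivalent of symbolic(): digitize each series, stack the labels,
# then count unique label rows. numpy.unique stands in for unique_row_count.
from numpy import digitize, linspace, unique, vstack
from numpy.random import RandomState

rs = RandomState(0)
x, y = rs.rand(200), rs.rand(200)

n_bins = 4
labels = vstack([digitize(arg, linspace(0, 1, n_bins + 1))
                 for arg in (x, y)]).T            # shape (200, 2)
_, counts = unique(labels, axis=0, return_counts=True)
print(counts.sum(), counts.size)  # 200 samples over at most 16 joint symbols
```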