Skip to content

Commit e64a348

Browse files
author
Carlos Hernandez
committed
Doane's Rule
1 parent 92a648c commit e64a348

File tree

6 files changed

+40
-93
lines changed

6 files changed

+40
-93
lines changed

docs/api.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ Binning Functions
1212
.. autosummary::
1313
:toctree: generated/
1414

15+
mdentropy.core.binning.doanes_rule
1516
mdentropy.core.binning.hist
1617
mdentropy.core.binning.symbolic
1718

mdentropy/core/binning.py

Lines changed: 35 additions & 85 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,32 @@
11
from ..utils import unique_row_count
22

3-
from itertools import product as iterproduct
3+
from numpy import (array, atleast_1d, digitize, empty, floor, linspace, log2,
4+
histogramdd, hstack, ndarray, sqrt, vstack)
5+
from scipy.stats import skew
46

5-
from numpy import (digitize, empty, linspace, histogramdd, hstack, product,
6-
vstack, zeros)
7+
__all__ = ['hist', 'symbolic', 'doanes_rule']
78

8-
from scipy.stats import binom_test
99

10-
__all__ = ['hist', 'symbolic', 'adaptive']
10+
def doanes_rule(x):
11+
"""Convenience function for choosing an optimal number of bins using Doane's Rule.
12+
13+
Parameters
14+
----------
15+
x : numpy.ndarray or list of floats
16+
Data to be binned.
17+
18+
Returns
19+
-------
20+
n_bins : int
21+
"""
22+
if not isinstance(x, ndarray):
23+
x = array(x)
24+
25+
n = x.shape[0]
26+
g1 = atleast_1d(skew(x))
27+
sg1 = sqrt(6 * (n - 2) / ((n + 1) * (n + 3)))
28+
29+
return min(floor(1 + log2(n) + log2(1 + abs(g1)/sg1)))
1130

1231

1332
def hist(n_bins, rng, *args):
@@ -27,6 +46,10 @@ def hist(n_bins, rng, *args):
2746
bins : array_like, shape = (n_bins, )
2847
"""
2948
data = vstack((args)).T
49+
50+
if n_bins is None:
51+
n_bins = doanes_rule(data)
52+
3053
return histogramdd(data, bins=n_bins, range=rng)[0].flatten()
3154

3255

@@ -47,88 +70,15 @@ def symbolic(n_bins, rng, *args):
4770
-------
4871
counts : float
4972
"""
50-
5173
labels = empty(0).reshape(args[0].shape[0], 0)
52-
for i, arg in enumerate(args):
53-
if n_bins is not None:
54-
partitions = linspace(rng[i][0], rng[i][1], n_bins + 1)
55-
label = digitize(arg, partitions).reshape(-1, 1)
56-
else:
57-
rng = tuple(rng)
58-
label = adaptive(arg)
59-
labels = hstack((labels, label))
60-
61-
return unique_row_count(labels)
74+
if n_bins is None:
75+
n_bins = min(map(doanes_rule, args))
6276

77+
for i, arg in enumerate(args):
6378

64-
def adaptive(*args, rng=None, alpha=0.05):
65-
"""Darbellay-Vajda adaptive partitioning (doi:10.1109/18.761290)
79+
partitions = linspace(rng[i][0], rng[i][1], n_bins + 1)
80+
label = digitize(arg, partitions).reshape(-1, 1)
6681

67-
Parameters
68-
----------
69-
args : array_like, shape = (n_samples, )
70-
Data of which to histogram.
71-
rng : list of lists
72-
List of min/max values to bin data over.
73-
alpha : float
74-
Chi-squared test criterion.
82+
labels = hstack((labels, label))
7583

76-
Returns
77-
-------
78-
bins : array_like, shape = (n_bins, )
79-
"""
80-
data = vstack(args).T
81-
82-
# Get number of dimensions
83-
n_dims = data.shape[1]
84-
dims = range(n_dims)
85-
86-
# If no ranges are supplied, initialize with min/max for each dimension
87-
if rng is None:
88-
rng = tuple((data[:, i].min(), data[:, i].max()) for i in dims)
89-
90-
if not (0. <= alpha < 1):
91-
raise ValueError('alpha must be a float in [0, 1).')
92-
93-
def dvpartition(data, rng):
94-
nonlocal n_dims
95-
nonlocal counts
96-
nonlocal labels
97-
nonlocal dims
98-
99-
# Filter out data that is not in our initial partition
100-
where = product([(i[0] <= data[:, j]) * (i[1] >= data[:, j])
101-
for j, i in enumerate(rng)], 0).astype(bool)
102-
filtered = data[where, :]
103-
104-
# Subdivide our partitions by the midpoint in each dimension
105-
partitions = set([])
106-
part = [linspace(rng[i][0], rng[i][1], 3) for i in dims]
107-
newrng = set((tuple((part[i][j[i]], part[i][j[i] + 1]) for i in dims)
108-
for j in iterproduct(*(n_dims * [[0, 1]]))),)
109-
110-
# Calculate counts for new partitions
111-
freq = histogramdd(filtered, bins=part)[0]
112-
113-
# Perform binomial test which a given alpha,
114-
# and if not uniform proceed
115-
if (binom_test(freq) < alpha / 2. and
116-
False not in ((filtered.max(0) - filtered.min(0)).T > 0)):
117-
118-
# For each new partition continue algorithm recursively
119-
for nr in newrng:
120-
newpart = dvpartition(data, rng=nr)
121-
for newp in newpart:
122-
partitions.update(tuple((newp,)))
123-
124-
# Else if uniform and contains data, return current partition
125-
elif filtered.shape[0] > 0:
126-
partitions = set(tuple((rng,)))
127-
labels[where] = len(counts)
128-
counts += (filtered.shape[0],)
129-
return partitions
130-
131-
counts = ()
132-
labels = zeros(data.shape[0], dtype=int)
133-
dvpartition(data, rng)
134-
return labels.reshape(-1, n_dims)
84+
return unique_row_count(labels)

mdentropy/core/information.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,6 @@ def knn_mutinf(x, y, k=None, boxsize=None):
5656
-------
5757
mi : float
5858
"""
59-
6059
data = hstack((x, y))
6160

6261
k = k if k else max(3, int(data.shape[0] * 0.01))
@@ -180,6 +179,5 @@ def ncmutinf(n_bins, x, y, z, rng=None, method='knn'):
180179
-------
181180
ncmi : float
182181
"""
183-
184182
return (cmutinf(n_bins, x, y, z, rng=rng, method=method) /
185183
centropy(n_bins, x, z, rng=rng, method=method))

mdentropy/tests/test_cmutinf.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ def test_ncmutinf_grassberger():
4545
rtol=.2)
4646

4747

48-
def test_ncmutinf_freedman_diaconis():
48+
def test_ncmutinf_doanes_rule():
4949
close(ncmutinf(None, a, b, c, method='grassberger'), TRUE_NCMUTINF,
5050
atol=.05, rtol=.4)
5151

mdentropy/tests/test_entropy.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33

44
import numpy as np
55
from numpy.testing import assert_allclose as close
6-
from unittest import skip
76

87
rs = np.random.RandomState(42)
98
n, d = 50000, 3
@@ -32,9 +31,8 @@ def test_entropy_grassberger():
3231
close(entropy(8, RNG, 'grassberger', X), TRUE_ENTROPY, rtol=.2)
3332

3433

35-
@skip('adaptive is still experimental')
36-
def test_entropy_adaptive():
37-
close(entropy(None, RNG, 'grassberger', X), TRUE_ENTROPY, rtol=.4)
34+
def test_entropy_doanes_rule():
35+
close(entropy(None, RNG, 'grassberger', X), TRUE_ENTROPY, atol=2., rtol=.2)
3836

3937

4038
def test_entropy_naive():

mdentropy/tests/test_mutinf.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ def test_mutinf_grassberger():
3939
rtol=.2)
4040

4141

42-
def test_mutinf_freedman_diaconis():
42+
def test_mutinf_doanes_rule():
4343
close(mutinf(None, X, Y, method='grassberger'), TRUE_MUTINF, atol=.01,
4444
rtol=.2)
4545

0 commit comments

Comments
 (0)