from ..utils import unique_row_count

-from itertools import product as iterproduct
+from numpy import (array, atleast_1d, digitize, empty, floor, linspace, log2,
+                   histogramdd, hstack, ndarray, sqrt, vstack)
+from scipy.stats import skew

-from numpy import (digitize, empty, linspace, histogramdd, hstack, product,
-                   vstack, zeros)
+__all__ = ['hist', 'symbolic', 'doanes_rule']

-from scipy.stats import binom_test

-__all__ = ['hist', 'symbolic', 'adaptive']
+def doanes_rule(x):
+    """Convenience function for choosing an optimal number of bins using Doane's Rule.
+
+    Parameters
+    ----------
+    x : numpy.ndarray or list of floats
+        Data to be binned.
+
+    Returns
+    -------
+    n_bins : int
+    """
+    if not isinstance(x, ndarray):
+        x = array(x)
+
+    n = x.shape[0]
+    g1 = atleast_1d(skew(x))
+    sg1 = sqrt(6 * (n - 2) / ((n + 1) * (n + 3)))
+
+    return min(floor(1 + log2(n) + log2(1 + abs(g1) / sg1)))


def hist(n_bins, rng, *args):
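For context, Doane's Rule extends Sturges' formula with a skewness term, so skewed samples earn extra bins. The snippet below is a standalone sketch of the same arithmetic using only numpy and scipy; the random sample and the printed value are illustrative, not part of this module.

```python
# Standalone sketch of the Doane's Rule arithmetic added above.
# The sample data here is made up for illustration.
from numpy import atleast_1d, floor, log2, sqrt
from numpy.random import RandomState
from scipy.stats import skew

x = RandomState(42).standard_normal(1000)

n = x.shape[0]
g1 = atleast_1d(skew(x))                       # sample skewness
sg1 = sqrt(6 * (n - 2) / ((n + 1) * (n + 3)))  # standard error of the skewness
n_bins = min(floor(1 + log2(n) + log2(1 + abs(g1) / sg1)))
print(n_bins)  # ~11 for 1000 roughly normal samples; note it is a numpy float
```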
@@ -27,6 +46,10 @@ def hist(n_bins, rng, *args):
    bins : array_like, shape = (n_bins, )
    """
    data = vstack((args)).T
+
+    if n_bins is None:
+        n_bins = doanes_rule(data)
+
    return histogramdd(data, bins=n_bins, range=rng)[0].flatten()

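`hist` now falls back to Doane's Rule whenever `n_bins is None`. Below is a minimal sketch of the stack, bin, and flatten pattern it wraps; the data, range, and fixed bin count (standing in for the `doanes_rule(data)` fallback, which returns a numpy float and may need an `int` cast on recent numpy) are made up for illustration.

```python
# Sketch of the stack -> histogramdd -> flatten pattern used by hist();
# the data, range, and bin count below are illustrative only.
from numpy import histogramdd, vstack
from numpy.random import RandomState

rs = RandomState(0)
x, y = rs.rand(500), rs.rand(500)

data = vstack((x, y)).T             # shape (500, 2): one column per series
counts = histogramdd(data, bins=8,  # 8 stands in for doanes_rule(data)
                     range=[(0, 1), (0, 1)])[0].flatten()
print(counts.shape, counts.sum())   # (64,) 500.0 -- flattened joint histogram
```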
@@ -47,88 +70,15 @@ def symbolic(n_bins, rng, *args):
    -------
    counts : float
    """
-
    labels = empty(0).reshape(args[0].shape[0], 0)
-    for i, arg in enumerate(args):
-        if n_bins is not None:
-            partitions = linspace(rng[i][0], rng[i][1], n_bins + 1)
-            label = digitize(arg, partitions).reshape(-1, 1)
-        else:
-            rng = tuple(rng)
-            label = adaptive(arg)
-        labels = hstack((labels, label))
-
-    return unique_row_count(labels)
+    if n_bins is None:
+        n_bins = min(map(doanes_rule, args))

+    for i, arg in enumerate(args):

-def adaptive(*args, rng=None, alpha=0.05):
-    """Darbellay-Vajda adaptive partitioning (doi:10.1109/18.761290)
+        partitions = linspace(rng[i][0], rng[i][1], n_bins + 1)
+        label = digitize(arg, partitions).reshape(-1, 1)

-    Parameters
-    ----------
-    args : array_like, shape = (n_samples, )
-        Data of which to histogram.
-    rng : list of lists
-        List of min/max values to bin data over.
-    alpha : float
-        Chi-squared test criterion.
+        labels = hstack((labels, label))

-    Returns
-    -------
-    bins : array_like, shape = (n_bins, )
-    """
-    data = vstack(args).T
-
-    # Get number of dimensions
-    n_dims = data.shape[1]
-    dims = range(n_dims)
-
-    # If no ranges are supplied, initialize with min/max for each dimension
-    if rng is None:
-        rng = tuple((data[:, i].min(), data[:, i].max()) for i in dims)
-
-    if not (0. <= alpha < 1):
-        raise ValueError('alpha must be a float in [0, 1).')
-
-    def dvpartition(data, rng):
-        nonlocal n_dims
-        nonlocal counts
-        nonlocal labels
-        nonlocal dims
-
-        # Filter out data that is not in our initial partition
-        where = product([(i[0] <= data[:, j]) * (i[1] >= data[:, j])
-                         for j, i in enumerate(rng)], 0).astype(bool)
-        filtered = data[where, :]
-
-        # Subdivide our partitions by the midpoint in each dimension
-        partitions = set([])
-        part = [linspace(rng[i][0], rng[i][1], 3) for i in dims]
-        newrng = set((tuple((part[i][j[i]], part[i][j[i] + 1]) for i in dims)
-                      for j in iterproduct(*(n_dims * [[0, 1]]))),)
-
-        # Calculate counts for new partitions
-        freq = histogramdd(filtered, bins=part)[0]
-
-        # Perform binomial test with a given alpha,
-        # and if not uniform proceed
-        if (binom_test(freq) < alpha / 2. and
-                False not in ((filtered.max(0) - filtered.min(0)).T > 0)):
-
-            # For each new partition continue algorithm recursively
-            for nr in newrng:
-                newpart = dvpartition(data, rng=nr)
-                for newp in newpart:
-                    partitions.update(tuple((newp,)))
-
-        # Else if uniform and contains data, return current partition
-        elif filtered.shape[0] > 0:
-            partitions = set(tuple((rng,)))
-            labels[where] = len(counts)
-            counts += (filtered.shape[0],)
-        return partitions
-
-    counts = ()
-    labels = zeros(data.shape[0], dtype=int)
-    dvpartition(data, rng)
-    return labels.reshape(-1, n_dims)
+    return unique_row_count(labels)
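With the adaptive partitioner gone, `symbolic` now digitizes each series on its own range (using the smallest Doane's-Rule estimate across the arguments when `n_bins is None`) and counts unique label rows. A rough standalone equivalent, with `numpy.unique` standing in for the package's `unique_row_count` helper and made-up data:

```python
# Rough equivalent of symbolic(): digitize each series, stack the labels,
# then count unique label rows. numpy.unique stands in for unique_row_count.
from numpy import digitize, linspace, unique, vstack
from numpy.random import RandomState

rs = RandomState(0)
x, y = rs.rand(200), rs.rand(200)

n_bins = 4
labels = vstack([digitize(arg, linspace(0, 1, n_bins + 1))
                 for arg in (x, y)]).T            # shape (200, 2)
_, counts = unique(labels, axis=0, return_counts=True)
print(counts.sum(), counts.size)  # 200 samples over at most 16 joint symbols
```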