2 files changed: +55 -0 lines changed
+ # Preprocessing
+ The preprocessing module implements common data preprocessing routines.
+
+ - `nlp.py`: Routines and objects for handling text data (see the n-gram sketch after this list).
+     - n-gram generators
+     - Word and character tokenization
+     - Punctuation and stop-word removal
+     - Vocabulary / unigram count objects
+ - `dsp.py`: Routines for handling audio and image data (see the DFT sketch after this list).
+     - Signal windowing
+     - Signal autocorrelation
+     - Discrete Fourier transform
+     - Signal resampling via (bi-)linear interpolation and nearest neighbor
+
+ - `general.py`: General data preprocessing functions.
+     - Feature hashing ([Moody, 1989](http://papers.nips.cc/paper/175-fast-learning-in-multi-resolution-hierarchies.pdf))
+     - Mini-batch generators
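Below is a minimal sketch of the kind of tokenization and n-gram routines described for `nlp.py`. The function names and the regex-based tokenizer are illustrative assumptions, not the module's actual API.

```python
import re

def tokenize_words(text):
    # Lowercase and split on non-word characters, a crude stand-in
    # for the module's word tokenizer
    return [tok for tok in re.split(r"\W+", text.lower()) if tok]

def ngrams(tokens, n=2):
    # Yield successive n-token tuples from a list of tokens
    for i in range(len(tokens) - n + 1):
        yield tuple(tokens[i : i + n])

tokens = tokenize_words("The quick brown fox jumps over the lazy dog.")
print(list(ngrams(tokens, n=2))[:3])
# [('the', 'quick'), ('quick', 'brown'), ('brown', 'fox')]
```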
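Here is a small sketch of the discrete Fourier transform listed under `dsp.py`: a naive O(N^2) implementation checked against `numpy.fft`. The helper name `dft_naive` and the test signal are assumptions for illustration, not necessarily how the module implements it.

```python
import numpy as np

def dft_naive(x):
    # Naive O(N^2) DFT: X[k] = sum_n x[n] * exp(-2j * pi * k * n / N)
    N = len(x)
    n = np.arange(N)
    k = n.reshape(-1, 1)
    return np.exp(-2j * np.pi * k * n / N) @ x

x = np.sin(2 * np.pi * 4 * np.arange(64) / 64)  # a 4-cycle sine over 64 samples
assert np.allclose(dft_naive(x), np.fft.fft(x))
```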
The second changed file adds a `minibatch` helper below the module-level `_SCIPY = False` line and above the existing `FeatureHasher` class:

_SCIPY = False


+ def minibatch(X, batchsize=256, shuffle=True):
+     """
+     Compute the minibatch indices for a training dataset.
+
+     Parameters
+     ----------
+     X : numpy array of shape (N, ...)
+         The dataset to divide into minibatches. Assumes the first dimension
+         represents the number of training examples.
+     batchsize : int (default: 256)
+         The desired size of each minibatch. Note, however, that if
+         X.shape[0] % batchsize > 0 then the final batch will contain fewer
+         than batchsize entries.
+     shuffle : bool (default: True)
+         Whether to shuffle the entries in the dataset before dividing into
+         minibatches
+
+     Returns
+     -------
+     mb_generator : generator
+         A generator which yields the indices into X for each batch
+     n_batches : int
+         The number of batches
+     """
+     N = X.shape[0]
+     ix = np.arange(N)
+     n_batches = int(np.ceil(N / batchsize))
+
+     if shuffle:
+         np.random.shuffle(ix)
+
+     def mb_generator():
+         for i in range(n_batches):
+             yield ix[i * batchsize : (i + 1) * batchsize]
+
+     return mb_generator(), n_batches
+
+

class FeatureHasher:
    def __init__(self, n_dim=256, sparse=True):
        """