@@ -51,6 +51,218 @@ def mb_generator():
     return mb_generator(), n_batches


+class OneHotEncoder:
+    def __init__(self):
+        """
+        Convert between category labels and their one-hot vector
+        representations.
+        """
+        self._is_fit = False
+        self.hyperparameters = {}
+        self.parameters = {"categories": None}
+
+    def __call__(self, labels):
+        return self.transform(labels)
+
+    def fit(self, categories):
+        """
+        Create mappings between columns and category labels.
+
+        Parameters
+        ----------
+        categories : list of length C
+            List of the unique category labels for the items to encode
+        """
+        self.parameters["categories"] = categories
+        self.cat2idx = {c: i for i, c in enumerate(categories)}
+        self.idx2cat = {i: c for i, c in enumerate(categories)}
+        self._is_fit = True
+
+    def transform(self, labels, categories=None):
+        """
+        Convert a list of labels into a one-hot encoding.
+
+        Parameters
+        ----------
+        labels : list of length N
+            A list of category labels
+        categories : list of length C (default: None)
+            List of the unique category labels for the items to encode
+
+        Returns
+        -------
+        Y : numpy array of shape (N, C)
+            The one-hot encoded labels. Each row corresponds to an example,
+            with a single 1 in the column corresponding to the respective
+            label
+        """
+        if not self._is_fit:
+            categories = set(labels) if categories is None else categories
+            self.fit(categories)
+
+        unknown = list(set(labels) - set(self.cat2idx.keys()))
+        assert len(unknown) == 0, "Unrecognized label(s): {}".format(unknown)
+
+        N, C = len(labels), len(self.cat2idx)
+        cols = np.array([self.cat2idx[c] for c in labels])
+
+        # fancy indexing: set a single 1 in each row at the label's column
+        Y = np.zeros((N, C))
+        Y[np.arange(N), cols] = 1
+        return Y
+
+    def inverse_transform(self, Y):
+        """
+        Convert a one-hot encoding back into the corresponding labels
+
+        Parameters
+        ----------
+        Y : numpy array of shape (N, C)
+            One-hot encoded labels. Each row corresponds to an example, with a
+            single 1 in the column associated with the label for that example
+
+        Returns
+        -------
+        labels : list of length N
+            The list of category labels corresponding to the nonzero columns
+            in Y
+        """
+        C = len(self.cat2idx)
+        assert Y.ndim == 2, "Y must be 2D, but has shape {}".format(Y.shape)
+        assert Y.shape[1] == C, "Y must have {} columns, got {}".format(C, Y.shape[1])
+        return [self.idx2cat[ix] for ix in Y.nonzero()[1]]
+
+
+class Standardizer:
+    def __init__(self, with_mean=True, with_std=True):
+        """
+        Feature-wise standardization for vector inputs.
+
+        Because the empirical mean and standard deviation are sensitive to
+        extreme values, `Standardizer` cannot guarantee balanced feature
+        scales in the presence of outliers. In particular, because outliers
+        for each feature can have different magnitudes, the spread of the
+        transformed data on each feature can be very different.
+
+        Similar to sklearn, `Standardizer` uses a biased estimator for the
+        standard deviation: numpy.std(x, ddof=0).
+
+        Parameters
+        ----------
+        with_mean : bool (default: True)
+            Whether to scale samples to have 0 mean during transformation
+        with_std : bool (default: True)
+            Whether to scale samples to have unit variance during
+            transformation
+        """
+        self.with_mean = with_mean
+        self.with_std = with_std
+        self._is_fit = False
+
+    @property
+    def hyperparameters(self):
+        H = {"with_mean": self.with_mean, "with_std": self.with_std}
+        return H
+
+    @property
+    def parameters(self):
+        params = {
+            "mean": self._mean if hasattr(self, "_mean") else None,
+            "std": self._std if hasattr(self, "_std") else None,
+        }
+        return params
+
+    def __call__(self, X):
+        return self.transform(X)
+
+    def fit(self, X):
+        """
+        Store the feature-wise mean and standard deviation across the samples
+        in X for future scaling.
+
+        Parameters
+        ----------
+        X : numpy array of shape (N, C)
+            An array of N samples, each with dimensionality C
+        """
+        if not isinstance(X, np.ndarray):
+            X = np.array(X)
+
+        if X.shape[0] < 2:
+            raise ValueError("`X` must contain at least 2 samples")
+
+        std = np.ones(X.shape[1])
+        mean = np.zeros(X.shape[1])
+
+        if self.with_mean:
+            mean = np.mean(X, axis=0)
+
+        if self.with_std:
+            std = np.std(X, axis=0, ddof=0)
+
+        self._mean = mean
+        self._std = std
+        self._is_fit = True
+
+    def transform(self, X):
+        """
+        Standardize features by removing the mean and scaling to unit variance.
+
+        For a sample `x`, the standardized score is calculated as:
+
+            z = (x - u) / s
+
+        where `u` is the mean of the training samples or zero if `with_mean`
+        is False, and `s` is the standard deviation of the training samples or
+        1 if `with_std` is False.
+
+        Parameters
+        ----------
+        X : numpy array of shape (N, C)
+            An array of N samples, each with dimensionality C
+
+        Returns
+        -------
+        Z : numpy array of shape (N, C)
+            The feature-wise standardized version of X
+        """
+        if not self._is_fit:
+            raise Exception("Must call `fit` before using the `transform` method")
+        return (X - self._mean) / self._std
+
+    def inverse_transform(self, Z):
+        """
+        Convert a collection of standardized features back into the original
+        feature space.
+
+        For a standardized sample `z`, the unstandardized score is calculated
+        as:
+
+            x = z * s + u
+
+        where `u` is the mean of the training samples or zero if `with_mean`
+        is False, and `s` is the standard deviation of the training samples or
+        1 if `with_std` is False.
+
+        Parameters
+        ----------
+        Z : numpy array of shape (N, C)
+            An array of N standardized samples, each with dimensionality C
+
+        Returns
+        -------
+        X : numpy array of shape (N, C)
+            The unstandardized samples from Z
+        """
+        assert self._is_fit, "Must fit `Standardizer` before calling inverse_transform"
+        P = self.parameters
+        mean, std = P["mean"], P["std"]
+        return Z * std + mean
+
+
 class FeatureHasher:
     def __init__(self, n_dim=256, sparse=True):
         """
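And a similar sketch for `Standardizer`, continuing in the same session and checking the round trip described in its docstrings (again with invented toy data):

    X = np.array([[1.0, 20.0], [2.0, 30.0], [3.0, 40.0]])

    stand = Standardizer()
    stand.fit(X)

    Z = stand.transform(X)            # equivalently: stand(X), via __call__
    np.allclose(Z.mean(axis=0), 0.0)  # True: each column now has zero mean
    np.allclose(Z.std(axis=0), 1.0)   # True: and unit (biased) std deviation

    np.allclose(stand.inverse_transform(Z), X)  # True: x = z * s + u

The round trip only works with the `parameters` property checking `hasattr(self, "_mean")` / `hasattr(self, "_std")` (as fixed above); checking the nonexistent `mean` / `std` attributes would make `inverse_transform` operate on `None`.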