Skip to content

Commit a195ad1

Browse files
committed
add standardizer and onehot encoder
1 parent e23bd51 commit a195ad1

File tree

1 file changed

+212
-0
lines changed

1 file changed

+212
-0
lines changed

preprocessing/general.py

+212
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,218 @@ def mb_generator():
5151
return mb_generator(), n_batches
5252

5353

54+
class OneHotEncoder:
    def __init__(self):
        """
        Convert between category labels and their one-hot vector
        representations.

        Parameters
        ----------
        categories : list of length C
            List of the unique category labels for the items to encode
        """
        self._is_fit = False
        self.hyperparameters = {}
        self.parameters = {"categories": None}

    def __call__(self, labels):
        return self.transform(labels)

    def fit(self, categories):
        """
        Create mappings between columns and category labels.

        Parameters
        ----------
        categories : list of length C
            List of the unique category labels for the items to encode
        """
        self.parameters["categories"] = categories
        # Forward and reverse lookup tables; column i <-> categories[i]
        self.cat2idx = {c: i for i, c in enumerate(categories)}
        self.idx2cat = {i: c for i, c in enumerate(categories)}
        self._is_fit = True

    def transform(self, labels, categories=None):
        """
        Convert a list of labels into a one-hot encoding.

        Parameters
        ----------
        labels : list of length N
            A list of category labels
        categories : list of length C (default: None)
            List of the unique category labels for the items to encode. If
            None and the encoder has not yet been fit, the categories are
            inferred from `labels`.

        Returns
        -------
        Y : numpy array of shape (N, C)
            The one-hot encoded labels. Each row corresponds to an example,
            with a single 1 in the column corresponding to the respective
            label

        Raises
        ------
        ValueError
            If `labels` contains a label not seen during `fit`.
        """
        if not self._is_fit:
            # BUGFIX: the original used `set(labels)`, whose iteration order
            # is nondeterministic, so the column <-> category assignment
            # varied between runs. Sorting gives a stable, reproducible
            # encoding.
            categories = sorted(set(labels)) if categories is None else categories
            self.fit(categories)

        # BUGFIX: validate with an explicit exception rather than `assert`,
        # which is silently stripped when Python runs with `-O`.
        unknown = list(set(labels) - set(self.cat2idx.keys()))
        if unknown:
            raise ValueError("Unrecognized label(s): {}".format(unknown))

        N, C = len(labels), len(self.cat2idx)
        cols = np.array([self.cat2idx[c] for c in labels])

        Y = np.zeros((N, C))
        # Scatter a single 1 into each row at that row's category column
        Y[np.arange(N), cols] = 1
        return Y

    def inverse_transform(self, Y):
        """
        Convert a one-hot encoding back into the corresponding labels

        Parameters
        ----------
        Y : numpy array of shape (N, C)
            One-hot encoded labels. Each row corresponds to an example, with a
            single 1 in the column associated with the label for that example

        Returns
        -------
        labels : list of length N
            The list of category labels corresponding to the nonzero columns
            in Y

        Raises
        ------
        ValueError
            If `Y` is not 2D or does not have C columns.
        """
        C = len(self.cat2idx)
        if Y.ndim != 2:
            raise ValueError("Y must be 2D, but has shape {}".format(Y.shape))
        if Y.shape[1] != C:
            raise ValueError("Y must have {} columns, got {}".format(C, Y.shape[1]))
        # NOTE: assumes exactly one nonzero entry per row; rows with zero or
        # multiple nonzero entries will not map back one-to-one.
        return [self.idx2cat[ix] for ix in Y.nonzero()[1]]
137+
138+
139+
class Standardizer:
    def __init__(self, with_mean=True, with_std=True):
        """
        Feature-wise standardization for vector inputs.

        Due to the sensitivity of empirical mean and standard deviation
        calculations to extreme values, `Standardizer` cannot guarantee
        balanced feature scales in the presence of outliers. In particular,
        note that because outliers for each feature can have different
        magnitudes, the spread of the transformed data on each feature can be
        very different.

        Similar to sklearn, `Standardizer` uses a biased estimator for the
        standard deviation: numpy.std(x, ddof=0).

        Parameters
        ----------
        with_mean : bool (default: True)
            Whether to scale samples to have 0 mean during transformation
        with_std : bool (default: True)
            Whether to scale samples to have unit variance during
            transformation
        """
        self.with_mean = with_mean
        self.with_std = with_std
        self._is_fit = False

    @property
    def hyperparameters(self):
        H = {"with_mean": self.with_mean, "with_std": self.with_std}
        return H

    @property
    def parameters(self):
        # BUGFIX: the fitted statistics are stored as `_mean` / `_std`, but
        # the original checked hasattr(self, "mean") / hasattr(self, "std"),
        # which never exist -- so `parameters` always reported None and
        # `inverse_transform` crashed multiplying by None.
        params = {
            "mean": self._mean if hasattr(self, "_mean") else None,
            "std": self._std if hasattr(self, "_std") else None,
        }
        return params

    def __call__(self, X):
        return self.transform(X)

    def fit(self, X):
        """
        Store the feature-wise mean and standard deviation across the samples
        in X for future scaling.

        Parameters
        ----------
        X : numpy array of shape (N, C)
            An array of N samples, each with dimensionality C

        Raises
        ------
        ValueError
            If `X` contains fewer than 2 samples.
        """
        if not isinstance(X, np.ndarray):
            X = np.array(X)

        if X.shape[0] < 2:
            raise ValueError("`X` must contain at least 2 samples")

        # Defaults leave the data unchanged when with_mean / with_std is off
        std = np.ones(X.shape[1])
        mean = np.zeros(X.shape[1])

        if self.with_mean:
            mean = np.mean(X, axis=0)

        if self.with_std:
            std = np.std(X, axis=0, ddof=0)
            # BUGFIX: a constant feature has std == 0, which made `transform`
            # divide by zero. Like sklearn's StandardScaler, treat a zero
            # scale as 1 so the feature passes through unscaled.
            std = np.where(std == 0, 1.0, std)

        self._mean = mean
        self._std = std
        self._is_fit = True

    def transform(self, X):
        """
        Standardize features by removing the mean and scaling to unit variance.

        For a sample `x`, the standardized score is calculated as:

            z = (x - u) / s

        where `u` is the mean of the training samples or zero if `with_mean` is
        False, and `s` is the standard deviation of the training samples or 1
        if `with_std` is False.

        Parameters
        ----------
        X : numpy array of shape (N, C)
            An array of N samples, each with dimensionality C

        Returns
        -------
        Z : numpy array of shape (N, C)
            The feature-wise standardized version of X

        Raises
        ------
        RuntimeError
            If `fit` has not been called.
        """
        if not self._is_fit:
            raise RuntimeError("Must call `fit` before using the `transform` method")
        return (X - self._mean) / self._std

    def inverse_transform(self, Z):
        """
        Convert a collection of standardized features back into the original
        feature space.

        For a standardized sample `z`, the unstandardized score is calculated as:

            x = z * s + u

        where `u` is the mean of the training samples or zero if `with_mean` is
        False, and `s` is the standard deviation of the training samples or 1
        if `with_std` is False.

        Parameters
        ----------
        Z : numpy array of shape (N, C)
            An array of N standardized samples, each with dimensionality C

        Returns
        -------
        X : numpy array of shape (N, C)
            The unstandardized samples from Z

        Raises
        ------
        RuntimeError
            If `fit` has not been called.
        """
        # Explicit exception (not `assert`) so the check survives `python -O`
        if not self._is_fit:
            raise RuntimeError(
                "Must fit `Standardizer` before calling inverse_transform"
            )
        P = self.parameters
        mean, std = P["mean"], P["std"]
        return Z * std + mean
264+
265+
54266
class FeatureHasher:
55267
def __init__(self, n_dim=256, sparse=True):
56268
"""

0 commit comments

Comments
 (0)