Skip to content

Commit 2394600

Browse files
committed
[refactor] Separate some processes
1 parent c3e0fa0 commit 2394600

File tree

1 file changed

+90
-68
lines changed

1 file changed

+90
-68
lines changed

autoPyTorch/data/tabular_feature_validator.py

Lines changed: 90 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -189,6 +189,29 @@ def _convert_all_nan_columns_to_numeric(self, X: pd.DataFrame, fit: bool = False
189189

190190
return X
191191

192+
def _encode_categories(self, X: pd.DataFrame) -> None:
    """Fit an ordinal-encoding column transformer on ``X``.

    Side effects (no return value):
        * sets ``self.column_transformer`` to a transformer built from the
          tabular preprocessors for ``self.enc_columns``,
        * re-sorts ``self.feat_type`` (the transformer moves categorical
          columns to the left, so the feature-type list must follow),
        * fills ``self.categories`` with ``list(range(n_categories))`` for
          every encoded categorical column.
    """
    self.column_transformer = _create_column_transformer(
        preprocessors=get_tabular_preprocessors(),
        categorical_columns=self.enc_columns,
    )

    # Mypy redefinition
    assert self.column_transformer is not None
    self.column_transformer.fit(X)

    # The column transformer moves categoricals to the left side
    self.feat_type = sorted(self.feat_type, key=functools.cmp_to_key(self._comparator))

    ordinal_encoder = (
        self.column_transformer
        .named_transformers_['categorical_pipeline']
        .named_steps['ordinalencoder']
    )
    # One ordinal encoding (0..n_categories-1) per categorical column
    self.categories = [list(range(len(cats))) for cats in ordinal_encoder.categories_]
214+
192215
def _fit(self, X: SupportedFeatTypes) -> BaseEstimator:
193216
"""
194217
In case input data is a pandas DataFrame, this utility encodes the user provided
@@ -216,44 +239,15 @@ def _fit(self, X: SupportedFeatTypes) -> BaseEstimator:
216239
self.enc_columns, self.feat_type = self._get_columns_to_encode(X)
217240

218241
assert self.feat_type is not None
219-
220242
if len(self.enc_columns) > 0:
221-
222-
preprocessors = get_tabular_preprocessors()
223-
self.column_transformer = _create_column_transformer(
224-
preprocessors=preprocessors,
225-
categorical_columns=self.enc_columns,
226-
)
227-
228-
# Mypy redefinition
229-
assert self.column_transformer is not None
230-
self.column_transformer.fit(X)
231-
232-
# The column transformer reorders the feature types
233-
# therefore, we need to change the order of columns as well
234-
# This means categorical columns are shifted to the left
235-
self.feat_type = sorted(
236-
self.feat_type,
237-
key=functools.cmp_to_key(self._comparator)
238-
)
239-
240-
encoded_categories = self.column_transformer.\
241-
named_transformers_['categorical_pipeline'].\
242-
named_steps['ordinalencoder'].categories_
243-
self.categories = [
244-
# We fit an ordinal encoder, where all categorical
245-
# columns are shifted to the left
246-
list(range(len(cat)))
247-
for cat in encoded_categories
248-
]
243+
self._encode_categories(X)
249244

250245
for i, type_ in enumerate(self.feat_type):
251246
if 'numerical' in type_:
252247
self.numerical_columns.append(i)
253248
else:
254249
self.categorical_columns.append(i)
255250

256-
# Lastly, store the number of features
257251
self.num_features = np.shape(X)[1]
258252
return self
259253

@@ -270,6 +264,41 @@ def transform(self, X: SupportedFeatTypes) -> Union[np.ndarray, spmatrix, pd.Dat
270264
Return:
271265
np.ndarray:
272266
The transformed array
267+
268+
Note:
269+
The default transform performs the following:
270+
* simple imputation for both
271+
* scaling for numerical
272+
* one-hot encoding for categorical
273+
For example, here is a simple case
274+
of which all the columns are categorical.
275+
data = [
276+
{'A': 1, 'B': np.nan, 'C': np.nan},
277+
{'A': np.nan, 'B': 3, 'C': np.nan},
278+
{'A': 2, 'B': np.nan, 'C': np.nan}
279+
]
280+
and suppose all the columns are categorical,
281+
then
282+
* `A` in {np.nan, 1, 2}
283+
* `B` in {np.nan, 3}
284+
* `C` in {np.nan} <=== it will be dropped.
285+
286+
So in the column A,
287+
* np.nan ==> [1, 0, 0] (always the index 0)
288+
* 1 ==> [0, 1, 0]
289+
* 2 ==> [0, 0, 1]
290+
in the column B,
291+
* np.nan ==> [1, 0]
292+
* 3 ==> [0, 1]
293+
Therefore, by concatenating,
294+
* {'A': 1, 'B': np.nan, 'C': np.nan} ==> [0, 1, 0, 1, 0]
295+
* {'A': np.nan, 'B': 3, 'C': np.nan} ==> [1, 0, 0, 0, 1]
296+
* {'A': 2, 'B': np.nan, 'C': np.nan} ==> [0, 0, 1, 1, 0]
297+
==> [
298+
[0, 1, 0, 1, 0],
299+
[1, 0, 0, 0, 1],
300+
[0, 0, 1, 1, 0]
301+
]
273302
"""
274303
if not self._is_fitted:
275304
raise NotFittedError("Cannot call transform on a validator that is not fitted")
@@ -288,14 +317,6 @@ def transform(self, X: SupportedFeatTypes) -> Union[np.ndarray, spmatrix, pd.Dat
288317

289318
# Pandas related transformations
290319
if ispandas(X) and self.column_transformer is not None:
291-
if np.any(pd.isnull(X)):
292-
# After above check it means that if there is a NaN
293-
# the whole column must be NaN
294-
# Make sure it is numerical and let the pipeline handle it
295-
for column in X.columns:
296-
if X[column].isna().all():
297-
X[column] = pd.to_numeric(X[column])
298-
299320
X = self.column_transformer.transform(X)
300321

301322
# Sparse related transformations
@@ -304,17 +325,15 @@ def transform(self, X: SupportedFeatTypes) -> Union[np.ndarray, spmatrix, pd.Dat
304325
X.sort_indices()
305326

306327
try:
307-
X = sklearn.utils.check_array(
308-
X,
309-
force_all_finite=False,
310-
accept_sparse='csr'
311-
)
328+
X = sklearn.utils.check_array(X, force_all_finite=False, accept_sparse='csr')
312329
except Exception as e:
313-
self.logger.exception(f"Conversion failed for input {X.dtypes} {X}"
314-
"This means AutoPyTorch was not able to properly "
315-
"Extract the dtypes of the provided input features. "
316-
"Please try to manually cast it to a supported "
317-
"numerical or categorical values.")
330+
self.logger.exception(
331+
f"Conversion failed for input {X.dtypes} {X}"
332+
"This means AutoPyTorch was not able to properly "
333+
"Extract the dtypes of the provided input features. "
334+
"Please try to manually cast it to a supported "
335+
"numerical or categorical values."
336+
)
318337
raise e
319338

320339
X = self._compress_dataset(X)
@@ -328,7 +347,6 @@ def _compress_dataset(self, X: DatasetCompressionInputType) -> DatasetCompressio
328347
the testing data is converted to the same dtype as
329348
the training data.
330349
331-
332350
Args:
333351
X (DatasetCompressionInputType):
334352
Dataset
@@ -510,27 +528,31 @@ def infer_objects(self, X: pd.DataFrame) -> pd.DataFrame:
510528
pd.DataFrame
511529
"""
512530
if hasattr(self, 'object_dtype_mapping'):
513-
# Mypy does not process the has attr. This dict is defined below
514-
for key, dtype in self.object_dtype_mapping.items(): # type: ignore[has-type]
515-
if 'int' in dtype.name:
516-
# In the case train data was interpreted as int
517-
# and test data was interpreted as float, because of 0.0
518-
# for example, honor training data
519-
X[key] = X[key].applymap(np.int64)
520-
else:
521-
try:
522-
X[key] = X[key].astype(dtype.name)
523-
except Exception as e:
524-
# Try inference if possible
525-
self.logger.warning(f"Tried to cast column {key} to {dtype} caused {e}")
526-
pass
531+
# honor the training data types
532+
try:
533+
# Mypy does not process the has attr.
534+
X = X.astype(self.object_dtype_mapping) # type: ignore[has-type]
535+
except Exception as e:
536+
# Try inference if possible
537+
self.logger.warning(f'Casting the columns to training dtypes ' # type: ignore[has-type]
538+
f'{self.object_dtype_mapping} caused the exception {e}')
539+
pass
527540
else:
528-
X = X.infer_objects()
529-
for column in X.columns:
530-
if not is_numeric_dtype(X[column]):
531-
X[column] = X[column].astype('category')
532-
self.object_dtype_mapping = {column: X[column].dtype for column in X.columns}
541+
if len(self.dtypes) != 0:
542+
# when train data has no object dtype, but test does
543+
# we prioritise the datatype given in training data
544+
dtype_dict = {col: dtype for col, dtype in zip(X.columns, self.dtypes)}
545+
X = X.astype(dtype_dict)
546+
else:
547+
# Calling for the first time to infer the categories
548+
X = X.infer_objects()
549+
dtype_dict = {col: 'category' for col, dtype in zip(X.columns, X.dtypes) if not is_numeric_dtype(dtype)}
550+
X = X.astype(dtype_dict)
551+
# only numerical attributes and categories
552+
self.object_dtype_mapping = {column: data_type for column, data_type in zip(X.columns, X.dtypes)}
553+
533554
self.logger.debug(f"Infer Objects: {self.object_dtype_mapping}")
555+
534556
return X
535557

536558

0 commit comments

Comments
 (0)