from sklearn.exceptions import NotFittedError
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
- from sklearn.preprocessing import OneHotEncoder, StandardScaler
+ from sklearn.preprocessing import OrdinalEncoder

from autoPyTorch.data.base_feature_validator import BaseFeatureValidator, SupportedFeatTypes
from autoPyTorch.data.utils import (
...

def _create_column_transformer(
    preprocessors: Dict[str, List[BaseEstimator]],
-     numerical_columns: List[str],
    categorical_columns: List[str],
) -> ColumnTransformer:
    """
@@ -39,49 +38,36 @@ def _create_column_transformer(
    Args:
        preprocessors (Dict[str, List[BaseEstimator]]):
            Dictionary containing list of numerical and categorical preprocessors.
-         numerical_columns (List[str]):
-             List of names of numerical columns
        categorical_columns (List[str]):
            List of names of categorical columns

    Returns:
        ColumnTransformer
    """

-     numerical_pipeline = 'drop'
-     categorical_pipeline = 'drop'
-     if len(numerical_columns) > 0:
-         numerical_pipeline = make_pipeline(*preprocessors['numerical'])
-     if len(categorical_columns) > 0:
-         categorical_pipeline = make_pipeline(*preprocessors['categorical'])
+     categorical_pipeline = make_pipeline(*preprocessors['categorical'])

    return ColumnTransformer([
-         ('categorical_pipeline', categorical_pipeline, categorical_columns),
-         ('numerical_pipeline', numerical_pipeline, numerical_columns)],
-         remainder='drop'
+         ('categorical_pipeline', categorical_pipeline, categorical_columns)],
+         remainder='passthrough'
    )


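With only a categorical pipeline registered and remainder='passthrough', the transformer encodes the categorical columns and appends the untouched numerical columns after them, which is why _fit later re-sorts self.feat_type. A minimal sketch of that ordering (column names here are made up for illustration):

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder

X = pd.DataFrame({'num': [0.5, 1.5], 'cat': ['a', 'b']})
ct = ColumnTransformer(
    [('categorical_pipeline', OrdinalEncoder(), ['cat'])],
    remainder='passthrough',
)
# the encoded 'cat' column comes first, the passthrough 'num' column is appended to the right
print(ct.fit_transform(X))  # [[0.  0.5]
                            #  [1.  1.5]]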
def get_tabular_preprocessors() -> Dict[str, List[BaseEstimator]]:
    """
    This function creates a Dictionary containing a list
    of numerical and categorical preprocessors
- 
    Returns:
        Dict[str, List[BaseEstimator]]
    """
    preprocessors: Dict[str, List[BaseEstimator]] = dict()

    # Categorical Preprocessors
-     onehot_encoder = OneHotEncoder(categories='auto', sparse=False, handle_unknown='ignore')
+     ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value',
+                                      unknown_value=-1)
    categorical_imputer = SimpleImputer(strategy='constant', copy=False)

-     # Numerical Preprocessors
-     numerical_imputer = SimpleImputer(strategy='median', copy=False)
-     standard_scaler = StandardScaler(with_mean=True, with_std=True, copy=False)
- 
-     preprocessors['categorical'] = [categorical_imputer, onehot_encoder]
-     preprocessors['numerical'] = [numerical_imputer, standard_scaler]
+     preprocessors['categorical'] = [categorical_imputer, ordinal_encoder]

    return preprocessors

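For context on the new encoder configuration: handle_unknown='use_encoded_value' together with unknown_value=-1 (available since scikit-learn 0.24) makes the OrdinalEncoder map categories never seen during fit to -1 instead of raising. A small sketch with made-up category values:

import numpy as np
from sklearn.preprocessing import OrdinalEncoder

enc = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
enc.fit(np.array([['red'], ['green'], ['blue']]))
# categories are sorted at fit time, so blue=0, green=1, red=2
print(enc.transform(np.array([['green'], ['purple']])))  # green -> 1., unseen 'purple' -> -1.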
@@ -176,31 +162,47 @@ def _fit(
        if hasattr(X, "iloc") and not issparse(X):
            X = cast(pd.DataFrame, X)

-             self.all_nan_columns = set([column for column in X.columns if X[column].isna().all()])
+             all_nan_columns = X.columns[X.isna().all()]
+             for col in all_nan_columns:
+                 X[col] = pd.to_numeric(X[col])
+ 
+             # Handle objects if possible
+             exist_object_columns = has_object_columns(X.dtypes.values)
+             if exist_object_columns:
+                 X = self.infer_objects(X)

-             categorical_columns, numerical_columns, feat_type = self._get_columns_info(X)
+             self.dtypes = [dt.name for dt in X.dtypes]  # Also note this change in self.dtypes
+             self.all_nan_columns = set(all_nan_columns)

-             self.enc_columns = categorical_columns
+             self.enc_columns, self.feat_type = self._get_columns_info(X)

-             preprocessors = get_tabular_preprocessors()
-             self.column_transformer = _create_column_transformer(
-                 preprocessors=preprocessors,
-                 numerical_columns=numerical_columns,
-                 categorical_columns=categorical_columns,
-             )
+             if len(self.enc_columns) > 0:

-             # Mypy redefinition
-             assert self.column_transformer is not None
-             self.column_transformer.fit(X)
+                 preprocessors = get_tabular_preprocessors()
+                 self.column_transformer = _create_column_transformer(
+                     preprocessors=preprocessors,
+                     categorical_columns=self.enc_columns,
+                 )

-             # The column transformer reorders the feature types
-             # therefore, we need to change the order of columns as well
-             # This means categorical columns are shifted to the left
+                 # Mypy redefinition
+                 assert self.column_transformer is not None
+                 self.column_transformer.fit(X)

-             self.feat_type = sorted(
-                 feat_type,
-                 key=functools.cmp_to_key(self._comparator)
-             )
+                 # The column transformer moves categorical columns before all numerical columns
+                 # therefore, we need to sort categorical columns so that it complies this change
+ 
+                 self.feat_type = sorted(
+                     self.feat_type,
+                     key=functools.cmp_to_key(self._comparator)
+                 )
+ 
+                 encoded_categories = self.column_transformer.\
+                     named_transformers_['categorical_pipeline'].\
+                     named_steps['ordinalencoder'].categories_
+                 self.categories = [
+                     list(range(len(cat)))
+                     for cat in encoded_categories
+                 ]

            # differently to categorical_columns and numerical_columns,
            # this saves the index of the column.
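The pd.to_numeric casts above exist because an all-NaN column that arrives with object dtype (for instance one built from Python None values) would otherwise fall into the object/categorical handling; converting it makes pandas store it as float64. A minimal sketch of the behaviour being relied on (toy data):

import pandas as pd

X = pd.DataFrame({'empty': [None, None], 'ok': [1, 2]})
print(X['empty'].dtype)                 # object
X['empty'] = pd.to_numeric(X['empty'])
print(X['empty'].dtype)                 # float64, all NaN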
@@ -280,6 +282,23 @@ def transform(
        if hasattr(X, "iloc") and not scipy.sparse.issparse(X):
            X = cast(Type[pd.DataFrame], X)

+             if self.all_nan_columns is None:
+                 raise ValueError('_fit must be called before calling transform')
+ 
+             for col in list(self.all_nan_columns):
+                 X[col] = np.nan
+                 X[col] = pd.to_numeric(X[col])
+ 
+             if len(self.categorical_columns) > 0:
+                 # when some categorical columns are not all nan in the training set
+                 # but they are all nan in the testing or validation set
+                 # we change those columns to `object` dtype
+                 # to ensure that these columns are changed to appropriate dtype
+                 # in self.infer_objects
+                 all_nan_cat_cols = set(X[self.enc_columns].columns[X[self.enc_columns].isna().all()])
+                 dtype_dict = {col: 'object' for col in self.enc_columns if col in all_nan_cat_cols}
+                 X = X.astype(dtype_dict)
+ 
        # Check the data here so we catch problems on new test data
        self._check_data(X)

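The reasoning behind the new dtype_dict cast: a categorical column can be entirely NaN in a validation or test split, in which case pandas typically holds it as float64; casting it to object first lets infer_objects re-apply the dtype remembered from training. A sketch of that dtype change (column names are illustrative):

import numpy as np
import pandas as pd

X_test = pd.DataFrame({'colour': [np.nan, np.nan], 'width': [1.0, 2.0]})
print(X_test['colour'].dtype)                    # float64, an all-NaN column
X_test = X_test.astype({'colour': 'object'})
print(X_test['colour'].dtype)                    # object, ready for dtype re-inference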
@@ -288,11 +307,6 @@ def transform(
        # We need to convert the column in test data to
        # object otherwise the test column is interpreted as float
        if self.column_transformer is not None:
-             if len(self.categorical_columns) > 0:
-                 categorical_columns = self.column_transformer.transformers_[0][-1]
-                 for column in categorical_columns:
-                     if X[column].isna().all():
-                         X[column] = X[column].astype('object')
            X = self.column_transformer.transform(X)

        # Sparse related transformations
@@ -407,7 +421,6 @@ def _check_data(
            self.column_order = column_order

        dtypes = [dtype.name for dtype in X.dtypes]
- 
        diff_cols = X.columns[[s_dtype != dtype for s_dtype, dtype in zip(self.dtypes, dtypes)]]
        if len(self.dtypes) == 0:
            self.dtypes = dtypes
@@ -419,7 +432,7 @@ def _check_data(
    def _get_columns_info(
        self,
        X: pd.DataFrame,
-     ) -> Tuple[List[str], List[str], List[str]]:
+     ) -> Tuple[List[str], List[str]]:
        """
        Return the columns to be encoded from a pandas dataframe

@@ -438,15 +451,12 @@ def _get_columns_info(
        """

        # Register if a column needs encoding
-         numerical_columns = []
        categorical_columns = []
        # Also, register the feature types for the estimator
        feat_type = []

        # Make sure each column is a valid type
        for i, column in enumerate(X.columns):
-             if self.all_nan_columns is not None and column in self.all_nan_columns:
-                 continue
            column_dtype = self.dtypes[i]
            err_msg = "Valid types are `numerical`, `categorical` or `boolean`, " \
                      "but input column {} has an invalid type `{}`.".format(column, column_dtype)
@@ -457,7 +467,6 @@ def _get_columns_info(
            # TypeError: data type not understood in certain pandas types
            elif is_numeric_dtype(column_dtype):
                feat_type.append('numerical')
-                 numerical_columns.append(column)
            elif column_dtype == 'object':
                # TODO verify how would this happen when we always convert the object dtypes to category
                raise TypeError(
@@ -483,7 +492,7 @@ def _get_columns_info(
                    "before feeding it to AutoPyTorch.".format(err_msg)
                )

-         return categorical_columns, numerical_columns, feat_type
+         return categorical_columns, feat_type

    def list_to_pandas(
        self,
@@ -553,22 +562,26 @@ def infer_objects(self, X: pd.DataFrame) -> pd.DataFrame:
            pd.DataFrame
        """
        if hasattr(self, 'object_dtype_mapping'):
-             # Mypy does not process the has attr. This dict is defined below
-             for key, dtype in self.object_dtype_mapping.items():  # type: ignore[has-type]
-                 # honor the training data types
-                 try:
-                     X[key] = X[key].astype(dtype.name)
-                 except Exception as e:
-                     # Try inference if possible
-                     self.logger.warning(f'Casting the column {key} to {dtype} caused the exception {e}')
-                     pass
+             # honor the training data types
+             try:
+                 # Mypy does not process the has attr.
+                 X = X.astype(self.object_dtype_mapping)  # type: ignore[has-type]
+             except Exception as e:
+                 # Try inference if possible
+                 self.logger.warning(f'Casting the columns to training dtypes '  # type: ignore[has-type]
+                                     f'{self.object_dtype_mapping} caused the exception {e}')
+                 pass
        else:
-             # Calling for the first time to infer the categories
-             X = X.infer_objects()
-             for column, data_type in zip(X.columns, X.dtypes):
-                 if not is_numeric_dtype(data_type):
-                     X[column] = X[column].astype('category')
- 
+             if len(self.dtypes) != 0:
+                 # when train data has no object dtype, but test does
+                 # we prioritise the datatype given in training data
+                 dtype_dict = {col: dtype for col, dtype in zip(X.columns, self.dtypes)}
+                 X = X.astype(dtype_dict)
+             else:
+                 # Calling for the first time to infer the categories
+                 X = X.infer_objects()
+                 dtype_dict = {col: 'category' for col, dtype in zip(X.columns, X.dtypes) if not is_numeric_dtype(dtype)}
+                 X = X.astype(dtype_dict)
        # only numerical attributes and categories
        self.object_dtype_mapping = {column: data_type for column, data_type in zip(X.columns, X.dtypes)}

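The first-call branch of the new infer_objects boils down to casting every non-numeric column to 'category' in a single astype call; roughly (toy data for illustration):

import pandas as pd
from pandas.api.types import is_numeric_dtype

X = pd.DataFrame({'colour': ['red', 'blue'], 'width': [1, 2]}).infer_objects()
dtype_dict = {col: 'category' for col, dtype in zip(X.columns, X.dtypes)
              if not is_numeric_dtype(dtype)}
print(X.astype(dtype_dict).dtypes)  # colour -> category, width -> int64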