from sklearn.exceptions import NotFittedError
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
-from sklearn.preprocessing import OneHotEncoder, StandardScaler
+from sklearn.preprocessing import OrdinalEncoder

from autoPyTorch.data.base_feature_validator import BaseFeatureValidator, SupportedFeatTypes
from autoPyTorch.utils.common import ispandas


def _create_column_transformer(
    preprocessors: Dict[str, List[BaseEstimator]],
-    numerical_columns: List[str],
    categorical_columns: List[str],
) -> ColumnTransformer:
    """
@@ -36,49 +35,36 @@ def _create_column_transformer(
    Args:
        preprocessors (Dict[str, List[BaseEstimator]]):
            Dictionary containing list of numerical and categorical preprocessors.
-        numerical_columns (List[str]):
-            List of names of numerical columns
        categorical_columns (List[str]):
            List of names of categorical columns

    Returns:
        ColumnTransformer
    """

-    numerical_pipeline = 'drop'
-    categorical_pipeline = 'drop'
-    if len(numerical_columns) > 0:
-        numerical_pipeline = make_pipeline(*preprocessors['numerical'])
-    if len(categorical_columns) > 0:
-        categorical_pipeline = make_pipeline(*preprocessors['categorical'])
+    categorical_pipeline = make_pipeline(*preprocessors['categorical'])

    return ColumnTransformer([
-        ('categorical_pipeline', categorical_pipeline, categorical_columns),
-        ('numerical_pipeline', numerical_pipeline, numerical_columns)],
-        remainder='drop'
+        ('categorical_pipeline', categorical_pipeline, categorical_columns)],
+        remainder='passthrough'
    )


def get_tabular_preprocessors() -> Dict[str, List[BaseEstimator]]:
    """
    This function creates a Dictionary containing a list
    of numerical and categorical preprocessors
-
    Returns:
        Dict[str, List[BaseEstimator]]
    """
    preprocessors: Dict[str, List[BaseEstimator]] = dict()

    # Categorical Preprocessors
-    onehot_encoder = OneHotEncoder(categories='auto', sparse=False, handle_unknown='ignore')
+    ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value',
+                                     unknown_value=-1)
    categorical_imputer = SimpleImputer(strategy='constant', copy=False)

-    # Numerical Preprocessors
-    numerical_imputer = SimpleImputer(strategy='median', copy=False)
-    standard_scaler = StandardScaler(with_mean=True, with_std=True, copy=False)
-
-    preprocessors['categorical'] = [categorical_imputer, onehot_encoder]
-    preprocessors['numerical'] = [numerical_imputer, standard_scaler]
+    preprocessors['categorical'] = [categorical_imputer, ordinal_encoder]

    return preprocessors
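A minimal sketch of what the revised helpers produce, assuming the two functions above are importable; the column names `'size'` and `'colour'` are made up for illustration, and `handle_unknown='use_encoded_value'` requires scikit-learn >= 0.24. Categorical columns are imputed and ordinal-encoded and come first in the transformed output, numerical columns pass through untouched because of `remainder='passthrough'`, and categories unseen at fit time are encoded as `-1`:

```python
import pandas as pd

# Toy frame; 'colour' is categorical, 'size' is numerical (hypothetical names).
X = pd.DataFrame({'size': [1.0, 2.0, 3.0],
                  'colour': ['red', 'blue', 'red']})

preprocessors = get_tabular_preprocessors()
ct = _create_column_transformer(preprocessors=preprocessors,
                                categorical_columns=['colour'])
ct.fit(X)

# Encoded 'colour' comes first, 'size' is passed through unchanged.
print(ct.transform(X))  # -> [[1., 1.], [0., 2.], [1., 3.]]
# A category unseen during fit maps to -1 instead of raising an error.
print(ct.transform(pd.DataFrame({'size': [4.0], 'colour': ['green']})))  # -> [[-1., 4.]]
```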
@@ -176,7 +162,16 @@ def _fit(
        if ispandas(X) and not issparse(X):
            X = cast(pd.DataFrame, X)

-            self.all_nan_columns = set([column for column in X.columns if X[column].isna().all()])
+            all_nan_columns = X.columns[X.isna().all()]
+            for col in all_nan_columns:
+                X[col] = pd.to_numeric(X[col])
+
+            # Handle objects if possible
+            exist_object_columns = has_object_columns(X.dtypes.values)
+            if exist_object_columns:
+                X = self.infer_objects(X)
+            self.dtypes = [dt.name for dt in X.dtypes]  # Also note this change in self.dtypes
+            self.all_nan_columns = set(all_nan_columns)

            self.transformed_columns, self.feat_types = self.get_columns_to_encode(X)
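For context on the new all-NaN handling, a toy illustration (not code from this PR): a column that is entirely NaN typically arrives with `object` dtype, and `pd.to_numeric` turns it into a float column so the later dtype bookkeeping treats it as numerical:

```python
import numpy as np
import pandas as pd

col = pd.Series([np.nan, np.nan], dtype='object')
print(col.dtype)                 # object
print(pd.to_numeric(col).dtype)  # float64
```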
@@ -188,18 +183,33 @@ def _fit(
                categorical_columns=self.transformed_columns,
            )

-            # Mypy redefinition
-            assert self.column_transformer is not None
-            self.column_transformer.fit(X)
+            if len(self.enc_columns) > 0:

-            # The column transformer reorders the feature types
-            # therefore, we need to change the order of columns as well
-            # This means categorical columns are shifted to the left
+                preprocessors = get_tabular_preprocessors()
+                self.column_transformer = _create_column_transformer(
+                    preprocessors=preprocessors,
+                    categorical_columns=self.enc_columns,
+                )

-            self.feat_types = sorted(
-                self.feat_types,
-                key=functools.cmp_to_key(self._comparator)
-            )
+                # Mypy redefinition
+                assert self.column_transformer is not None
+                self.column_transformer.fit(X)
+
+                # The column transformer moves categorical columns before all numerical columns,
+                # therefore we need to sort the categorical columns to comply with this change
+
+                self.feat_types = sorted(
+                    self.feat_types,
+                    key=functools.cmp_to_key(self._comparator)
+                )
+
+                encoded_categories = self.column_transformer.\
+                    named_transformers_['categorical_pipeline'].\
+                    named_steps['ordinalencoder'].categories_
+                self.categories = [
+                    list(range(len(cat)))
+                    for cat in encoded_categories
+                ]

            # differently to categorical_columns and numerical_columns,
            # this saves the index of the column.
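The lookup of the fitted encoder relies on two pieces of scikit-learn behaviour worth spelling out (a standalone sketch, not part of the diff): `make_pipeline` names each step after its lower-cased class name, which is why the step is reachable as `named_steps['ordinalencoder']`, and `categories_` holds the original category values per column, which the code above then replaces with plain integer ranges:

```python
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OrdinalEncoder

# Step names are derived from the lower-cased class names.
pipe = make_pipeline(SimpleImputer(strategy='constant'), OrdinalEncoder())
print([name for name, _ in pipe.steps])   # ['simpleimputer', 'ordinalencoder']

# categories_ keeps the original values; the PR stores only their index ranges.
enc = OrdinalEncoder().fit([['red'], ['blue'], ['green']])
print(enc.categories_)                                      # [array(['blue', 'green', 'red'], ...)]
print([list(range(len(cat))) for cat in enc.categories_])   # [[0, 1, 2]]
```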
@@ -279,6 +289,23 @@ def transform(
        if ispandas(X) and not issparse(X):
            X = cast(pd.DataFrame, X)

+            if self.all_nan_columns is None:
+                raise ValueError('_fit must be called before calling transform')
+
+            for col in list(self.all_nan_columns):
+                X[col] = np.nan
+                X[col] = pd.to_numeric(X[col])
+
+            if len(self.categorical_columns) > 0:
+                # when some categorical columns are not all nan in the training set
+                # but are all nan in the testing or validation set,
+                # we change those columns to `object` dtype
+                # to ensure that they are converted to the appropriate dtype
+                # in self.infer_objects
+                all_nan_cat_cols = set(X[self.enc_columns].columns[X[self.enc_columns].isna().all()])
+                dtype_dict = {col: 'object' for col in self.enc_columns if col in all_nan_cat_cols}
+                X = X.astype(dtype_dict)
+
            # Check the data here so we catch problems on new test data
            self._check_data(X)
@@ -287,11 +314,6 @@ def transform(
            # We need to convert the column in test data to
            # object otherwise the test column is interpreted as float
            if self.column_transformer is not None:
-                if len(self.categorical_columns) > 0:
-                    categorical_columns = self.column_transformer.transformers_[0][-1]
-                    for column in categorical_columns:
-                        if X[column].isna().all():
-                            X[column] = X[column].astype('object')
                X = self.column_transformer.transform(X)

            # Sparse related transformations
@@ -380,7 +402,6 @@ def _check_data(
        self.column_order = column_order

        dtypes = [dtype.name for dtype in X.dtypes]
-
        diff_cols = X.columns[[s_dtype != dtype for s_dtype, dtype in zip(self.dtypes, dtypes)]]
        if len(self.dtypes) == 0:
            self.dtypes = dtypes
@@ -448,7 +469,7 @@ def _validate_feat_types(self, X: pd.DataFrame) -> None:
    def _get_columns_to_encode(
        self,
        X: pd.DataFrame,
-    ) -> Tuple[List[str], List[str], List[str]]:
+    ) -> Tuple[List[str], List[str]]:
        """
        Return the columns to be transformed as well as
        the type of feature for each column from a pandas dataframe.
@@ -478,8 +499,8 @@ def _get_columns_to_encode(
        # Also, register the feature types for the estimator
        feat_types = []

-        # Make sure each column is a valid type
-        for column in X.columns:
+        # Make sure each column is a valid type
+        for i, column in enumerate(X.columns):
            if self.all_nan_columns is not None and column in self.all_nan_columns:
                continue
            column_dtype = self.dtypes[i]
@@ -592,22 +613,26 @@ def infer_objects(self, X: pd.DataFrame) -> pd.DataFrame:
            pd.DataFrame
        """
        if hasattr(self, 'object_dtype_mapping'):
-            # Mypy does not process the has attr. This dict is defined below
-            for key, dtype in self.object_dtype_mapping.items():  # type: ignore[has-type]
-                # honor the training data types
-                try:
-                    X[key] = X[key].astype(dtype.name)
-                except Exception as e:
-                    # Try inference if possible
-                    self.logger.warning(f'Casting the column {key} to {dtype} caused the exception {e}')
-                    pass
+            # honor the training data types
+            try:
+                # Mypy does not process the has attr.
+                X = X.astype(self.object_dtype_mapping)  # type: ignore[has-type]
+            except Exception as e:
+                # Try inference if possible
+                self.logger.warning(f'Casting the columns to the training dtypes '  # type: ignore[has-type]
+                                    f'{self.object_dtype_mapping} caused the exception {e}')
+                pass
        else:
-            # Calling for the first time to infer the categories
-            X = X.infer_objects()
-            for column, data_type in zip(X.columns, X.dtypes):
-                if not is_numeric_dtype(data_type):
-                    X[column] = X[column].astype('category')
-
+            if len(self.dtypes) != 0:
+                # when the training data has no object dtype but the test data does,
+                # we prioritise the dtypes seen during training
+                dtype_dict = {col: dtype for col, dtype in zip(X.columns, self.dtypes)}
+                X = X.astype(dtype_dict)
+            else:
+                # Calling for the first time to infer the categories
+                X = X.infer_objects()
+                dtype_dict = {col: 'category' for col, dtype in zip(X.columns, X.dtypes) if not is_numeric_dtype(dtype)}
+                X = X.astype(dtype_dict)
        # only numerical attributes and categories
        self.object_dtype_mapping = {column: data_type for column, data_type in zip(X.columns, X.dtypes)}
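A small sketch of the new first-call branch in `infer_objects` (toy data, not from the PR): object columns are soft-converted first, and whatever is still non-numeric is cast to `category` in a single `astype` call:

```python
import pandas as pd
from pandas.api.types import is_numeric_dtype

X = pd.DataFrame({'a': pd.Series([1, 2], dtype='object'),  # numeric values stored as object
                  'b': ['x', 'y']})                         # genuinely non-numeric
X = X.infer_objects()                                       # 'a' becomes int64, 'b' stays object
dtype_dict = {col: 'category' for col, dtype in zip(X.columns, X.dtypes)
              if not is_numeric_dtype(dtype)}
X = X.astype(dtype_dict)
print(X.dtypes)                                             # a: int64, b: category
```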