11
11
import sklearn .utils
12
12
from sklearn import preprocessing
13
13
from sklearn .base import BaseEstimator
14
- from sklearn .compose import make_column_transformer
14
+ from sklearn .compose import ColumnTransformer
15
15
from sklearn .exceptions import NotFittedError
16
16
17
17
from autoPyTorch .data .base_feature_validator import BaseFeatureValidator , SUPPORTED_FEAT_TYPES
@@ -53,16 +53,34 @@ def _fit(
53
53
for column in X .columns :
54
54
if X [column ].isna ().all ():
55
55
X [column ] = pd .to_numeric (X [column ])
56
+ # Also note this change in self.dtypes
57
+ if len (self .dtypes ) != 0 :
58
+ self .dtypes [list (X .columns ).index (column )] = X [column ].dtype
56
59
57
60
self .enc_columns , self .feat_type = self ._get_columns_to_encode (X )
58
61
59
62
if len (self .enc_columns ) > 0 :
60
-
61
- self .encoder = make_column_transformer (
62
- (preprocessing .OrdinalEncoder (
63
- handle_unknown = 'use_encoded_value' ,
64
- unknown_value = - 1 ,
65
- ), self .enc_columns ),
63
+ # impute missing values before encoding,
64
+ # remove once sklearn natively supports
65
+ # it in ordinal encoding. Sklearn issue:
66
+ # https://github.com/scikit-learn/scikit-learn/issues/17123
67
+ for column in self .enc_columns :
68
+ if X [column ].isna ().any ():
69
+ missing_value : typing .Union [int , str ] = - 1
70
+ # make sure for a string column we give
71
+ # string missing value else we give numeric
72
+ if type (X [column ][0 ]) == str :
73
+ missing_value = str (missing_value )
74
+ X [column ] = X [column ].cat .add_categories ([missing_value ])
75
+ X [column ] = X [column ].fillna (missing_value )
76
+
77
+ self .encoder = ColumnTransformer (
78
+ [
79
+ ("encoder" ,
80
+ preprocessing .OrdinalEncoder (
81
+ handle_unknown = 'use_encoded_value' ,
82
+ unknown_value = - 1 ,
83
+ ), self .enc_columns )],
66
84
remainder = "passthrough"
67
85
)
68
86
@@ -85,6 +103,7 @@ def comparator(cmp1: str, cmp2: str) -> int:
85
103
return 1
86
104
else :
87
105
raise ValueError ((cmp1 , cmp2 ))
106
+
88
107
self .feat_type = sorted (
89
108
self .feat_type ,
90
109
key = functools .cmp_to_key (comparator )
@@ -182,9 +201,8 @@ def _check_data(
182
201
if not isinstance (X , (np .ndarray , pd .DataFrame )) and not scipy .sparse .issparse (X ):
183
202
raise ValueError ("AutoPyTorch only supports Numpy arrays, Pandas DataFrames,"
184
203
" scipy sparse and Python Lists, yet, the provided input is"
185
- " of type {}" .format (
186
- type (X )
187
- ))
204
+ " of type {}" .format (type (X ))
205
+ )
188
206
189
207
if self .data_type is None :
190
208
self .data_type = type (X )
@@ -217,39 +235,25 @@ def _check_data(
217
235
# per estimator
218
236
enc_columns , _ = self ._get_columns_to_encode (X )
219
237
220
- if len (enc_columns ) > 0 :
221
- if np .any (pd .isnull (
222
- X [enc_columns ].dropna ( # type: ignore[call-overload]
223
- axis = 'columns' , how = 'all' )
224
- )):
225
- # Ignore all NaN columns, and if still a NaN
226
- # Error out
227
- raise ValueError ("Categorical features in a dataframe cannot contain "
228
- "missing/NaN values. The OrdinalEncoder used by "
229
- "AutoPyTorch cannot handle this yet (due to a "
230
- "limitation on scikit-learn being addressed via: "
231
- "https://github.com/scikit-learn/scikit-learn/issues/17123)"
232
- )
233
238
column_order = [column for column in X .columns ]
234
239
if len (self .column_order ) > 0 :
235
240
if self .column_order != column_order :
236
241
raise ValueError ("Changing the column order of the features after fit() is "
237
242
"not supported. Fit() method was called with "
238
- "{} whereas the new features have {} as type" .format (
239
- self .column_order ,
240
- column_order ,
241
- ))
243
+ "{} whereas the new features have {} as type" .format (self .column_order ,
244
+ column_order ,)
245
+ )
242
246
else :
243
247
self .column_order = column_order
244
248
dtypes = [dtype .name for dtype in X .dtypes ]
245
249
if len (self .dtypes ) > 0 :
246
250
if self .dtypes != dtypes :
247
251
raise ValueError ("Changing the dtype of the features after fit() is "
248
252
"not supported. Fit() method was called with "
249
- "{} whereas the new features have {} as type" .format (
250
- self . dtypes ,
251
- dtypes ,
252
- ))
253
+ "{} whereas the new features have {} as type" .format (self . dtypes ,
254
+ dtypes ,
255
+ )
256
+ )
253
257
else :
254
258
self .dtypes = dtypes
255
259
@@ -294,7 +298,8 @@ def _get_columns_to_encode(
294
298
"pandas.Series.astype ."
295
299
"If working with string objects, the following "
296
300
"tutorial illustrates how to work with text data: "
297
- "https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html" .format ( # noqa: E501
301
+ "https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html" .format (
302
+ # noqa: E501
298
303
column ,
299
304
)
300
305
)
@@ -349,15 +354,13 @@ def list_to_dataframe(
349
354
# If a list was provided, it will be converted to pandas
350
355
X_train = pd .DataFrame (data = X_train ).infer_objects ()
351
356
self .logger .warning ("The provided feature types to AutoPyTorch are of type list."
352
- "Features have been interpreted as: {}" .format (
353
- [(col , t ) for col , t in zip (X_train .columns , X_train .dtypes )]
354
- ))
357
+ "Features have been interpreted as: {}" .format ([(col , t ) for col , t in
358
+ zip (X_train .columns , X_train .dtypes )]))
355
359
if X_test is not None :
356
360
if not isinstance (X_test , list ):
357
361
self .logger .warning ("Train features are a list while the provided test data"
358
- "is {}. X_test will be casted as DataFrame." .format (
359
- type (X_test )
360
- ))
362
+ "is {}. X_test will be casted as DataFrame." .format (type (X_test ))
363
+ )
361
364
X_test = pd .DataFrame (data = X_test ).infer_objects ()
362
365
return X_train , X_test
363
366
0 commit comments