@@ -57,22 +57,13 @@ def _fit(
57
57
if len (self .dtypes ) != 0 :
58
58
self .dtypes [list (X .columns ).index (column )] = X [column ].dtype
59
59
60
+ if not X .select_dtypes (include = 'object' ).empty :
61
+ X = self .infer_objects (X )
62
+
60
63
self .enc_columns , self .feat_type = self ._get_columns_to_encode (X )
61
64
62
65
if len (self .enc_columns ) > 0 :
63
- # impute missing values before encoding,
64
- # remove once sklearn natively supports
65
- # it in ordinal encoding. Sklearn issue:
66
- # "https://github.com/scikit-learn/scikit-learn/issues/17123)"
67
- for column in self .enc_columns :
68
- if X [column ].isna ().any ():
69
- missing_value : typing .Union [int , str ] = - 1
70
- # make sure for a string column we give
71
- # string missing value else we give numeric
72
- if type (X [column ][0 ]) == str :
73
- missing_value = str (missing_value )
74
- X [column ] = X [column ].cat .add_categories ([missing_value ])
75
- X [column ] = X [column ].fillna (missing_value )
66
+ X = self .impute_nan_in_categories (X )
76
67
77
68
self .encoder = ColumnTransformer (
78
69
[
@@ -160,6 +151,10 @@ def transform(
160
151
if X [column ].isna ().all ():
161
152
X [column ] = pd .to_numeric (X [column ])
162
153
154
+ # Also remove the object dtype for new data
155
+ if not X .select_dtypes (include = 'object' ).empty :
156
+ X = self .infer_objects (X )
157
+
163
158
# Check the data here so we catch problems on new test data
164
159
self ._check_data (X )
165
160
@@ -172,18 +167,32 @@ def transform(
172
167
for column in X .columns :
173
168
if X [column ].isna ().all ():
174
169
X [column ] = pd .to_numeric (X [column ])
170
+
171
+ # We also need to fillna on the transformation
172
+ # in case test data is provided
173
+ X = self .impute_nan_in_categories (X )
174
+
175
175
X = self .encoder .transform (X )
176
176
177
177
# Sparse related transformations
178
178
# Not all sparse format support index sorting
179
179
if scipy .sparse .issparse (X ) and hasattr (X , 'sort_indices' ):
180
180
X .sort_indices ()
181
181
182
- return sklearn .utils .check_array (
183
- X ,
184
- force_all_finite = False ,
185
- accept_sparse = 'csr'
186
- )
182
+ try :
183
+ X = sklearn .utils .check_array (
184
+ X ,
185
+ force_all_finite = False ,
186
+ accept_sparse = 'csr'
187
+ )
188
+ except Exception as e :
189
+ self .logger .exception (f"Conversion failed for input { X .dtypes } { X } "
190
+ "This means AutoPyTorch was not able to properly "
191
+ "Extract the dtypes of the provided input features. "
192
+ "Please try to manually cast it to a supported "
193
+ "numerical or categorical values." )
194
+ raise e
195
+ return X
187
196
188
197
def _check_data (
189
198
self ,
@@ -231,6 +240,10 @@ def _check_data(
231
240
# If entered here, we have a pandas dataframe
232
241
X = typing .cast (pd .DataFrame , X )
233
242
243
+ # Handle objects if possible
244
+ if not X .select_dtypes (include = 'object' ).empty :
245
+ X = self .infer_objects (X )
246
+
234
247
# Define the column to be encoded here as the feature validator is fitted once
235
248
# per estimator
236
249
enc_columns , _ = self ._get_columns_to_encode (X )
@@ -245,6 +258,7 @@ def _check_data(
245
258
)
246
259
else :
247
260
self .column_order = column_order
261
+
248
262
dtypes = [dtype .name for dtype in X .dtypes ]
249
263
if len (self .dtypes ) > 0 :
250
264
if self .dtypes != dtypes :
@@ -379,3 +393,96 @@ def numpy_array_to_pandas(
379
393
pd.DataFrame
380
394
"""
381
395
return pd .DataFrame (X ).infer_objects ().convert_dtypes ()
396
+
397
def infer_objects(self, X: pd.DataFrame) -> pd.DataFrame:
    """
    In case the input contains object columns, their type is inferred if possible

    The inference happens only on the first call. The resulting per-column
    dtypes are stored in ``self.object_dtype_mapping`` and re-applied on every
    later call, so the test and train data are treated equally.

    Arguments:
        X (pd.DataFrame):
            data to be interpreted. On calls after the first one the
            columns are re-assigned on this very frame.

    Returns:
        pd.DataFrame
            The frame with the object columns casted. Columns that cannot
            be casted to the stored dtype are left untouched and a warning
            is logged.
    """
    if hasattr(self, 'object_dtype_mapping'):
        # Mypy does not process the has attr. This dict is defined below
        for key, dtype in self.object_dtype_mapping.items():  # type: ignore[has-type]
            # In the case train data was interpreted as int
            # and test data was interpreted as float, because of 0.0
            # for example, honor training data.
            # X[key] is a Series, which has no applymap(), so a plain
            # astype is used for the integer cast.
            target = np.int64 if 'int' in dtype.name else dtype.name
            try:
                X[key] = X[key].astype(target)
            except Exception as e:
                # Best effort: keep the column as it is and report the problem
                self.logger.warning(f"Tried to cast column {key} to {dtype} caused {e}")
    else:
        X = X.infer_objects()
        for column in X.columns:
            # Whatever is not a number after the inference is treated as a category
            if not is_numeric_dtype(X[column]):
                X[column] = X[column].astype('category')
        self.object_dtype_mapping = {column: X[column].dtype for column in X.columns}
    self.logger.debug(f"Infer Objects: {self.object_dtype_mapping}")
    return X
433
+
434
def impute_nan_in_categories(self, X: pd.DataFrame) -> pd.DataFrame:
    """
    impute missing values before encoding,
    remove once sklearn natively supports
    it in ordinal encoding. Sklearn issue:
    "https://github.com/scikit-learn/scikit-learn/issues/17123)"

    Every column listed in ``self.enc_columns`` that contains a NaN gets
    its NaNs replaced by a per-column missing value. The value chosen for
    a column is remembered in ``self.dict_missing_value_per_col`` so that
    later calls (e.g. on test data) map to the same value.

    Arguments:
        X (pd.DataFrame):
            data to be interpreted. The affected columns are re-assigned
            on this very frame.

    Returns:
        pd.DataFrame
            The frame without NaN in the columns to encode.
    """

    # To be on the safe side, map always to the same missing
    # value per column. Create the mapping only once, otherwise the
    # values chosen during fit would be forgotten on transform.
    if not hasattr(self, 'dict_missing_value_per_col'):
        self.dict_missing_value_per_col: typing.Dict[str, typing.Any] = {}

    # First make sure that we do not alter the type of the column which cause:
    # TypeError: '<' not supported between instances of 'int' and 'str'
    # in the encoding
    for column in self.enc_columns:
        if not X[column].isna().any():
            continue

        is_categorical = isinstance(X[column].dtype, pd.CategoricalDtype)

        if column not in self.dict_missing_value_per_col:
            observed = X[column].dropna()
            first_value = observed.iloc[0] if not observed.empty else None
            try:
                float(first_value)
                can_cast_as_number = True
            except (TypeError, ValueError, OverflowError):
                # Also reached for a column without any observed value
                can_cast_as_number = False

            if can_cast_as_number:
                # In this case, we expect to have a number as category
                # it might be string, but its value represent a number
                missing_value: typing.Union[str, int] = '-1' if isinstance(first_value, str) else -1
            else:
                missing_value = 'Missing!'

            # Make sure this missing value is not seen before
            # Do this check for categorical columns
            # else modify the value
            if is_categorical:
                while missing_value in X[column].cat.categories:
                    if isinstance(missing_value, str):
                        missing_value += '0'
                    else:
                        missing_value += missing_value
            self.dict_missing_value_per_col[column] = missing_value

        fill_value = self.dict_missing_value_per_col[column]
        filled = X[column]
        # A categorical column can only be filled with a known category.
        # add_categories returns a new object and fails on duplicates,
        # so register the value only when needed and keep the result.
        if is_categorical and fill_value not in filled.cat.categories:
            filled = filled.cat.add_categories([fill_value])
        X[column] = filled.fillna(fill_value)
    return X
0 commit comments