@@ -189,6 +189,29 @@ def _convert_all_nan_columns_to_numeric(self, X: pd.DataFrame, fit: bool = False
189
189
190
190
return X
191
191
192
def _encode_categories(self, X: pd.DataFrame) -> None:
    """Fit the tabular column transformer and record the category codes.

    Builds the ``ColumnTransformer`` from the configured tabular
    preprocessors, fits it on ``X``, keeps ``self.feat_type`` consistent
    with the transformer's column order, and stores one list of integer
    codes per encoded categorical column in ``self.categories``.

    Args:
        X (pd.DataFrame):
            The training data whose categorical columns
            (``self.enc_columns``) should be ordinally encoded.
    """
    self.column_transformer = _create_column_transformer(
        preprocessors=get_tabular_preprocessors(),
        categorical_columns=self.enc_columns,
    )

    # Narrow the Optional type for mypy before using the transformer.
    assert self.column_transformer is not None
    self.column_transformer.fit(X)

    # The column transformer moves categorical columns to the left side,
    # so reorder the recorded feature types to match.
    self.feat_type = sorted(
        self.feat_type,
        key=functools.cmp_to_key(self._comparator),
    )

    ordinal_encoder = (
        self.column_transformer
        .named_transformers_['categorical_pipeline']
        .named_steps['ordinalencoder']
    )

    # One ordinal encoding (0..len(cat)-1) per categorical column.
    self.categories = [
        list(range(len(cat))) for cat in ordinal_encoder.categories_
    ]
192
215
def _fit (self , X : SupportedFeatTypes ) -> BaseEstimator :
193
216
"""
194
217
In case input data is a pandas DataFrame, this utility encodes the user provided
@@ -216,44 +239,15 @@ def _fit(self, X: SupportedFeatTypes) -> BaseEstimator:
216
239
self .enc_columns , self .feat_type = self ._get_columns_to_encode (X )
217
240
218
241
assert self .feat_type is not None
219
-
220
242
if len (self .enc_columns ) > 0 :
221
-
222
- preprocessors = get_tabular_preprocessors ()
223
- self .column_transformer = _create_column_transformer (
224
- preprocessors = preprocessors ,
225
- categorical_columns = self .enc_columns ,
226
- )
227
-
228
- # Mypy redefinition
229
- assert self .column_transformer is not None
230
- self .column_transformer .fit (X )
231
-
232
- # The column transformer reorders the feature types
233
- # therefore, we need to change the order of columns as well
234
- # This means categorical columns are shifted to the left
235
- self .feat_type = sorted (
236
- self .feat_type ,
237
- key = functools .cmp_to_key (self ._comparator )
238
- )
239
-
240
- encoded_categories = self .column_transformer .\
241
- named_transformers_ ['categorical_pipeline' ].\
242
- named_steps ['ordinalencoder' ].categories_
243
- self .categories = [
244
- # We fit an ordinal encoder, where all categorical
245
- # columns are shifted to the left
246
- list (range (len (cat )))
247
- for cat in encoded_categories
248
- ]
243
+ self ._encode_categories (X )
249
244
250
245
for i , type_ in enumerate (self .feat_type ):
251
246
if 'numerical' in type_ :
252
247
self .numerical_columns .append (i )
253
248
else :
254
249
self .categorical_columns .append (i )
255
250
256
- # Lastly, store the number of features
257
251
self .num_features = np .shape (X )[1 ]
258
252
return self
259
253
@@ -270,6 +264,41 @@ def transform(self, X: SupportedFeatTypes) -> Union[np.ndarray, spmatrix, pd.Dat
270
264
Return:
271
265
np.ndarray:
272
266
The transformed array
267
+
268
+ Note:
269
+ The default transform performs the following:
270
+ * simple imputation for both
271
+ * scaling for numerical
272
+ * one-hot encoding for categorical
273
+ For example, here is a simple case
274
+ in which all the columns are categorical.
275
+ data = [
276
+ {'A': 1, 'B': np.nan, 'C': np.nan},
277
+ {'A': np.nan, 'B': 3, 'C': np.nan},
278
+ {'A': 2, 'B': np.nan, 'C': np.nan}
279
+ ]
280
+ and suppose all the columns are categorical,
281
+ then
282
+ * `A` in {np.nan, 1, 2}
283
+ * `B` in {np.nan, 3}
284
+ * `C` in {np.nan} <=== it will be dropped.
285
+
286
+ So in the column A,
287
+ * np.nan ==> [1, 0, 0] (always the index 0)
288
+ * 1 ==> [0, 1, 0]
289
+ * 2 ==> [0, 0, 1]
290
+ in the column B,
291
+ * np.nan ==> [1, 0]
292
+ * 3 ==> [0, 1]
293
+ Therefore, by concatenating,
294
+ * {'A': 1, 'B': np.nan, 'C': np.nan} ==> [0, 1, 0, 1, 0]
295
+ * {'A': np.nan, 'B': 3, 'C': np.nan} ==> [1, 0, 0, 0, 1]
296
+ * {'A': 2, 'B': np.nan, 'C': np.nan} ==> [0, 0, 1, 1, 0]
297
+ ==> [
298
+ [0, 1, 0, 1, 0],
299
+ [1, 0, 0, 0, 1],
300
+ [0, 0, 1, 1, 0]
301
+ ]
273
302
"""
274
303
if not self ._is_fitted :
275
304
raise NotFittedError ("Cannot call transform on a validator that is not fitted" )
@@ -288,14 +317,6 @@ def transform(self, X: SupportedFeatTypes) -> Union[np.ndarray, spmatrix, pd.Dat
288
317
289
318
# Pandas related transformations
290
319
if ispandas (X ) and self .column_transformer is not None :
291
- if np .any (pd .isnull (X )):
292
- # After above check it means that if there is a NaN
293
- # the whole column must be NaN
294
- # Make sure it is numerical and let the pipeline handle it
295
- for column in X .columns :
296
- if X [column ].isna ().all ():
297
- X [column ] = pd .to_numeric (X [column ])
298
-
299
320
X = self .column_transformer .transform (X )
300
321
301
322
# Sparse related transformations
@@ -304,17 +325,15 @@ def transform(self, X: SupportedFeatTypes) -> Union[np.ndarray, spmatrix, pd.Dat
304
325
X .sort_indices ()
305
326
306
327
try :
307
- X = sklearn .utils .check_array (
308
- X ,
309
- force_all_finite = False ,
310
- accept_sparse = 'csr'
311
- )
328
+ X = sklearn .utils .check_array (X , force_all_finite = False , accept_sparse = 'csr' )
312
329
except Exception as e :
313
- self .logger .exception (f"Conversion failed for input { X .dtypes } { X } "
314
- "This means AutoPyTorch was not able to properly "
315
- "Extract the dtypes of the provided input features. "
316
- "Please try to manually cast it to a supported "
317
- "numerical or categorical values." )
330
+ self .logger .exception (
331
+ f"Conversion failed for input { X .dtypes } { X } "
332
+ "This means AutoPyTorch was not able to properly "
333
+ "Extract the dtypes of the provided input features. "
334
+ "Please try to manually cast it to a supported "
335
+ "numerical or categorical values."
336
+ )
318
337
raise e
319
338
320
339
X = self ._compress_dataset (X )
@@ -328,7 +347,6 @@ def _compress_dataset(self, X: DatasetCompressionInputType) -> DatasetCompressio
328
347
the testing data is converted to the same dtype as
329
348
the training data.
330
349
331
-
332
350
Args:
333
351
X (DatasetCompressionInputType):
334
352
Dataset
@@ -510,27 +528,31 @@ def infer_objects(self, X: pd.DataFrame) -> pd.DataFrame:
510
528
pd.DataFrame
511
529
"""
512
530
if hasattr (self , 'object_dtype_mapping' ):
513
- # Mypy does not process the has attr. This dict is defined below
514
- for key , dtype in self .object_dtype_mapping .items (): # type: ignore[has-type]
515
- if 'int' in dtype .name :
516
- # In the case train data was interpreted as int
517
- # and test data was interpreted as float, because of 0.0
518
- # for example, honor training data
519
- X [key ] = X [key ].applymap (np .int64 )
520
- else :
521
- try :
522
- X [key ] = X [key ].astype (dtype .name )
523
- except Exception as e :
524
- # Try inference if possible
525
- self .logger .warning (f"Tried to cast column { key } to { dtype } caused { e } " )
526
- pass
531
+ # honor the training data types
532
+ try :
533
+ # Mypy does not process the has attr.
534
+ X = X .astype (self .object_dtype_mapping ) # type: ignore[has-type]
535
+ except Exception as e :
536
+ # Try inference if possible
537
+ self .logger .warning (f'Casting the columns to training dtypes ' # type: ignore[has-type]
538
+ f'{ self .object_dtype_mapping } caused the exception { e } ' )
539
+ pass
527
540
else :
528
- X = X .infer_objects ()
529
- for column in X .columns :
530
- if not is_numeric_dtype (X [column ]):
531
- X [column ] = X [column ].astype ('category' )
532
- self .object_dtype_mapping = {column : X [column ].dtype for column in X .columns }
541
+ if len (self .dtypes ) != 0 :
542
+ # when train data has no object dtype, but test does
543
+ # we prioritise the datatype given in training data
544
+ dtype_dict = {col : dtype for col , dtype in zip (X .columns , self .dtypes )}
545
+ X = X .astype (dtype_dict )
546
+ else :
547
+ # Calling for the first time to infer the categories
548
+ X = X .infer_objects ()
549
+ dtype_dict = {col : 'category' for col , dtype in zip (X .columns , X .dtypes ) if not is_numeric_dtype (dtype )}
550
+ X = X .astype (dtype_dict )
551
+ # only numerical attributes and categories
552
+ self .object_dtype_mapping = {column : data_type for column , data_type in zip (X .columns , X .dtypes )}
553
+
533
554
self .logger .debug (f"Infer Objects: { self .object_dtype_mapping } " )
555
+
534
556
return X
535
557
536
558
0 commit comments