@@ -163,9 +163,13 @@ def _fit(
163
163
# with nan values.
164
164
# Columns that are completely made of NaN values are provided to the pipeline
165
165
# so that later stages decide how to handle them
166
+
167
+ # Clear whatever null column markers we had previously
168
+ self .null_columns .clear ()
166
169
if np .any (pd .isnull (X )):
167
170
for column in X .columns :
168
171
if X [column ].isna ().all ():
172
+ self .null_columns .add (column )
169
173
X [column ] = pd .to_numeric (X [column ])
170
174
# Also note this change in self.dtypes
171
175
if len (self .dtypes ) != 0 :
@@ -174,9 +178,8 @@ def _fit(
174
178
if not X .select_dtypes (include = 'object' ).empty :
175
179
X = self .infer_objects (X )
176
180
177
- self .transformed_columns , self .feat_type = self ._get_columns_to_encode (X )
178
-
179
- assert self .feat_type is not None
181
+ self ._check_data (X )
182
+ self .enc_columns , self .feat_type = self ._get_columns_to_encode (X )
180
183
181
184
if len (self .transformed_columns ) > 0 :
182
185
@@ -246,29 +249,37 @@ def transform(
246
249
X = self .numpy_array_to_pandas (X )
247
250
248
251
if hasattr (X , "iloc" ) and not issparse (X ):
249
- if np .any (pd .isnull (X )):
250
- for column in X .columns :
251
- if X [column ].isna ().all ():
252
- X [column ] = pd .to_numeric (X [column ])
252
+ X = cast (pd .DataFrame , X )
253
+ # If we had null columns in our fit call and we made them numeric, then:
254
+ # - If the columns are null even in transform, apply the same procedure.
255
+ # - Otherwise, substitute the values with np.NaN and then make the columns numeric.
256
+ # If the column is null here, but it was not in fit, it does not matter.
257
+ for column in self .null_columns :
258
+ # The column is not entirely null here; make it null since it was entirely null in fit.
259
+ if not X [column ].isna ().all ():
260
+ X [column ] = np .NaN
261
+ X [column ] = pd .to_numeric (X [column ])
262
+
263
+ # for the test set, if we have columns with only null values
264
+ # they will probably have a numeric type. If these columns were not
265
+ # entirely null in the train set, they should be converted
266
+ # to the type that they had during fitting.
267
+ for column in X .columns :
268
+ if X [column ].isna ().all ():
269
+ X [column ] = X [column ].astype (self .dtypes [list (X .columns ).index (column )])
253
270
254
271
# Also remove the object dtype for new data
255
272
if not X .select_dtypes (include = 'object' ).empty :
256
273
X = self .infer_objects (X )
257
274
258
275
# Check the data here so we catch problems on new test data
259
276
self ._check_data (X )
277
+ # We also need to fillna on the transformation
278
+ # in case test data is provided
279
+ X = self .impute_nan_in_categories (X )
260
280
261
- # Pandas related transformations
262
- if hasattr (X , "iloc" ) and self .column_transformer is not None :
263
- if np .any (pd .isnull (X )):
264
- # After above check it means that if there is a NaN
265
- # the whole column must be NaN
266
- # Make sure it is numerical and let the pipeline handle it
267
- for column in X .columns :
268
- if X [column ].isna ().all ():
269
- X [column ] = pd .to_numeric (X [column ])
270
-
271
- X = self .column_transformer .transform (X )
281
+ if self .encoder is not None :
282
+ X = self .encoder .transform (X )
272
283
273
284
# Sparse related transformations
274
285
# Not all sparse format support index sorting
@@ -525,7 +536,7 @@ def numpy_array_to_pandas(
525
536
Returns:
526
537
pd.DataFrame
527
538
"""
528
- return pd .DataFrame (X ).infer_objects (). convert_dtypes ()
539
+ return pd .DataFrame (X ).convert_dtypes ()
529
540
530
541
def infer_objects (self , X : pd .DataFrame ) -> pd .DataFrame :
531
542
"""
@@ -543,18 +554,13 @@ def infer_objects(self, X: pd.DataFrame) -> pd.DataFrame:
543
554
if hasattr (self , 'object_dtype_mapping' ):
544
555
# Mypy does not narrow on the hasattr check above. This dict is defined below
545
556
for key , dtype in self .object_dtype_mapping .items (): # type: ignore[has-type]
546
- if 'int' in dtype .name :
547
- # In the case train data was interpreted as int
548
- # and test data was interpreted as float, because of 0.0
549
- # for example, honor training data
550
- X [key ] = X [key ].applymap (np .int64 )
551
- else :
552
- try :
553
- X [key ] = X [key ].astype (dtype .name )
554
- except Exception as e :
555
- # Try inference if possible
556
- self .logger .warning (f"Tried to cast column { key } to { dtype } caused { e } " )
557
- pass
557
+ # honor the training data types
558
+ try :
559
+ X [key ] = X [key ].astype (dtype .name )
560
+ except Exception as e :
561
+ # Try inference if possible
562
+ self .logger .warning (f"Tried to cast column { key } to { dtype } caused { e } " )
563
+ pass
558
564
else :
559
565
X = X .infer_objects ()
560
566
for column in X .columns :
0 commit comments