handle nans in categorical columns (#118)

ravinkohli · web-flow · commit ffc162031a6c · 2021-03-01T17:23:57.000+01:00
* handle nans in categorical columns

* Fixed error in self dtypes

* Addressed comments from francisco

* Forgot to commit

* Fix flake
diff --git a/autoPyTorch/data/tabular_feature_validator.py b/autoPyTorch/data/tabular_feature_validator.py
@@ -11,7 +11,7 @@
 import sklearn.utils
 from sklearn import preprocessing
 from sklearn.base import BaseEstimator
-from sklearn.compose import make_column_transformer
+from sklearn.compose import ColumnTransformer
 from sklearn.exceptions import NotFittedError
 
 from autoPyTorch.data.base_feature_validator import BaseFeatureValidator, SUPPORTED_FEAT_TYPES
@@ -53,16 +53,34 @@ def _fit(
                 for column in X.columns:
                     if X[column].isna().all():
                         X[column] = pd.to_numeric(X[column])
+                        # Also note this change in self.dtypes
+                        if len(self.dtypes) != 0:
+                            self.dtypes[list(X.columns).index(column)] = X[column].dtype
 
             self.enc_columns, self.feat_type = self._get_columns_to_encode(X)
 
             if len(self.enc_columns) > 0:
-
-                self.encoder = make_column_transformer(
-                    (preprocessing.OrdinalEncoder(
-                        handle_unknown='use_encoded_value',
-                        unknown_value=-1,
-                    ), self.enc_columns),
+                # impute missing values before encoding,
+                # remove once sklearn natively supports
+                # it in ordinal encoding. Sklearn issue:
+                # "https://github.com/scikit-learn/scikit-learn/issues/17123)"
+                for column in self.enc_columns:
+                    if X[column].isna().any():
+                        missing_value: typing.Union[int, str] = -1
+                        # make sure for a string column we give
+                        # string missing value else we give numeric
+                        if type(X[column][0]) == str:
+                            missing_value = str(missing_value)
+                        X[column] = X[column].cat.add_categories([missing_value])
+                        X[column] = X[column].fillna(missing_value)
+
+                self.encoder = ColumnTransformer(
+                    [
+                        ("encoder",
+                         preprocessing.OrdinalEncoder(
+                             handle_unknown='use_encoded_value',
+                             unknown_value=-1,
+                         ), self.enc_columns)],
                     remainder="passthrough"
                 )
 
@@ -85,6 +103,7 @@ def comparator(cmp1: str, cmp2: str) -> int:
                         return 1
                     else:
                         raise ValueError((cmp1, cmp2))
+
                 self.feat_type = sorted(
                     self.feat_type,
                     key=functools.cmp_to_key(comparator)
@@ -182,9 +201,8 @@ def _check_data(
         if not isinstance(X, (np.ndarray, pd.DataFrame)) and not scipy.sparse.issparse(X):
             raise ValueError("AutoPyTorch only supports Numpy arrays, Pandas DataFrames,"
                              " scipy sparse and Python Lists, yet, the provided input is"
-                             " of type {}".format(
-                                 type(X)
-                             ))
+                             " of type {}".format(type(X))
+                             )
 
         if self.data_type is None:
             self.data_type = type(X)
@@ -217,39 +235,25 @@ def _check_data(
             # per estimator
             enc_columns, _ = self._get_columns_to_encode(X)
 
-            if len(enc_columns) > 0:
-                if np.any(pd.isnull(
-                    X[enc_columns].dropna(  # type: ignore[call-overload]
-                        axis='columns', how='all')
-                )):
-                    # Ignore all NaN columns, and if still a NaN
-                    # Error out
-                    raise ValueError("Categorical features in a dataframe cannot contain "
-                                     "missing/NaN values. The OrdinalEncoder used by "
-                                     "AutoPyTorch cannot handle this yet (due to a "
-                                     "limitation on scikit-learn being addressed via: "
-                                     "https://github.com/scikit-learn/scikit-learn/issues/17123)"
-                                     )
             column_order = [column for column in X.columns]
             if len(self.column_order) > 0:
                 if self.column_order != column_order:
                     raise ValueError("Changing the column order of the features after fit() is "
                                      "not supported. Fit() method was called with "
-                                     "{} whereas the new features have {} as type".format(
-                                         self.column_order,
-                                         column_order,
-                                     ))
+                                     "{} whereas the new features have {} as type".format(self.column_order,
+                                                                                          column_order,)
+                                     )
             else:
                 self.column_order = column_order
             dtypes = [dtype.name for dtype in X.dtypes]
             if len(self.dtypes) > 0:
                 if self.dtypes != dtypes:
                     raise ValueError("Changing the dtype of the features after fit() is "
                                      "not supported. Fit() method was called with "
-                                     "{} whereas the new features have {} as type".format(
-                                         self.dtypes,
-                                         dtypes,
-                                     ))
+                                     "{} whereas the new features have {} as type".format(self.dtypes,
+                                                                                          dtypes,
+                                                                                          )
+                                     )
             else:
                 self.dtypes = dtypes
 
@@ -294,7 +298,8 @@ def _get_columns_to_encode(
                         "pandas.Series.astype ."
                         "If working with string objects, the following "
                         "tutorial illustrates how to work with text data: "
-                        "https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html".format(  # noqa: E501
+                        "https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html".format(
+                            # noqa: E501
                             column,
                         )
                     )
@@ -349,15 +354,13 @@ def list_to_dataframe(
         # If a list was provided, it will be converted to pandas
         X_train = pd.DataFrame(data=X_train).infer_objects()
         self.logger.warning("The provided feature types to AutoPyTorch are of type list."
-                            "Features have been interpreted as: {}".format(
-                                [(col, t) for col, t in zip(X_train.columns, X_train.dtypes)]
-                            ))
+                            "Features have been interpreted as: {}".format([(col, t) for col, t in
+                                                                            zip(X_train.columns, X_train.dtypes)]))
         if X_test is not None:
             if not isinstance(X_test, list):
                 self.logger.warning("Train features are a list while the provided test data"
-                                    "is {}. X_test will be casted as DataFrame.".format(
-                                        type(X_test)
-                                    ))
+                                    "is {}. X_test will be casted as DataFrame.".format(type(X_test))
+                                    )
             X_test = pd.DataFrame(data=X_test).infer_objects()
         return X_train, X_test
 
diff --git a/test/test_data/test_feature_validator.py b/test/test_data/test_feature_validator.py
@@ -231,10 +231,17 @@ def test_featurevalidator_unsupported_numpy(input_data_featuretest):
     ),
     indirect=True
 )
-def test_featurevalidator_unsupported_pandas(input_data_featuretest):
+def test_featurevalidator_categorical_nan(input_data_featuretest):
     validator = TabularFeatureValidator()
-    with pytest.raises(ValueError, match=r"Categorical features in a dataframe.*missing/NaN"):
-        validator.fit(input_data_featuretest)
+    validator.fit(input_data_featuretest)
+    transformed_X = validator.transform(input_data_featuretest)
+    assert any(pd.isna(input_data_featuretest))
+    assert any((-1 in categories) or ('-1' in categories) for categories in
+               validator.encoder.named_transformers_['encoder'].categories_)
+    assert np.shape(input_data_featuretest) == np.shape(transformed_X)
+    assert np.issubdtype(transformed_X.dtype, np.number)
+    assert validator._is_fitted
+    assert isinstance(transformed_X, np.ndarray)
 
 
 @pytest.mark.parametrize(