Merge pull request #126 from franchuterivera/refactor_development_benchmarkproblems

ravinkohli · web-flow · commit a3d40ac08553 · 2021-03-08T14:20:25.000+01:00
Fixes to address automlbenchmark problems
diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py
@@ -858,8 +858,11 @@ def _search(
                 saveable_trajectory = \
                     [list(entry[:2]) + [entry[2].get_dictionary()] + list(entry[3:])
                      for entry in self.trajectory]
-                with open(trajectory_filename, 'w') as fh:
-                    json.dump(saveable_trajectory, fh)
+                try:
+                    with open(trajectory_filename, 'w') as fh:
+                        json.dump(saveable_trajectory, fh)
+                except Exception as e:
+                    self._logger.warning(f"Cannot save {trajectory_filename} due to {e}...")
             except Exception as e:
                 self._logger.exception(str(e))
                 raise
diff --git a/autoPyTorch/data/tabular_feature_validator.py b/autoPyTorch/data/tabular_feature_validator.py
@@ -57,22 +57,13 @@ def _fit(
                         if len(self.dtypes) != 0:
                             self.dtypes[list(X.columns).index(column)] = X[column].dtype
 
+            if not X.select_dtypes(include='object').empty:
+                X = self.infer_objects(X)
+
             self.enc_columns, self.feat_type = self._get_columns_to_encode(X)
 
             if len(self.enc_columns) > 0:
-                # impute missing values before encoding,
-                # remove once sklearn natively supports
-                # it in ordinal encoding. Sklearn issue:
-                # "https://github.com/scikit-learn/scikit-learn/issues/17123)"
-                for column in self.enc_columns:
-                    if X[column].isna().any():
-                        missing_value: typing.Union[int, str] = -1
-                        # make sure for a string column we give
-                        # string missing value else we give numeric
-                        if type(X[column][0]) == str:
-                            missing_value = str(missing_value)
-                        X[column] = X[column].cat.add_categories([missing_value])
-                        X[column] = X[column].fillna(missing_value)
+                X = self.impute_nan_in_categories(X)
 
                 self.encoder = ColumnTransformer(
                     [
@@ -160,6 +151,10 @@ def transform(
                     if X[column].isna().all():
                         X[column] = pd.to_numeric(X[column])
 
+            # Also remove the object dtype for new data
+            if not X.select_dtypes(include='object').empty:
+                X = self.infer_objects(X)
+
         # Check the data here so we catch problems on new test data
         self._check_data(X)
 
@@ -172,18 +167,32 @@ def transform(
                 for column in X.columns:
                     if X[column].isna().all():
                         X[column] = pd.to_numeric(X[column])
+
+            # We also need to fillna on the transformation
+            # in case test data is provided
+            X = self.impute_nan_in_categories(X)
+
             X = self.encoder.transform(X)
 
         # Sparse related transformations
         # Not all sparse format support index sorting
         if scipy.sparse.issparse(X) and hasattr(X, 'sort_indices'):
             X.sort_indices()
 
-        return sklearn.utils.check_array(
-            X,
-            force_all_finite=False,
-            accept_sparse='csr'
-        )
+        try:
+            X = sklearn.utils.check_array(
+                X,
+                force_all_finite=False,
+                accept_sparse='csr'
+            )
+        except Exception as e:
+            self.logger.exception(f"Conversion failed for input {X.dtypes} {X}"
+                                  "This means AutoPyTorch was not able to properly "
+                                  "Extract the dtypes of the provided input features. "
+                                  "Please try to manually cast it to a supported "
+                                  "numerical or categorical values.")
+            raise e
+        return X
 
     def _check_data(
         self,
@@ -231,6 +240,10 @@ def _check_data(
             # If entered here, we have a pandas dataframe
             X = typing.cast(pd.DataFrame, X)
 
+            # Handle objects if possible
+            if not X.select_dtypes(include='object').empty:
+                X = self.infer_objects(X)
+
             # Define the column to be encoded here as the feature validator is fitted once
             # per estimator
             enc_columns, _ = self._get_columns_to_encode(X)
@@ -245,6 +258,7 @@ def _check_data(
                                      )
             else:
                 self.column_order = column_order
+
             dtypes = [dtype.name for dtype in X.dtypes]
             if len(self.dtypes) > 0:
                 if self.dtypes != dtypes:
@@ -379,3 +393,96 @@ def numpy_array_to_pandas(
             pd.DataFrame
         """
         return pd.DataFrame(X).infer_objects().convert_dtypes()
+
+    def infer_objects(self, X: pd.DataFrame) -> pd.DataFrame:
+        """
+        In case the input contains object columns, their type is inferred if possible
+
+        This has to be done once, so the test and train data are treated equally
+
+        Arguments:
+            X (pd.DataFrame):
+                data to be interpreted.
+
+        Returns:
+            pd.DataFrame
+        """
+        if hasattr(self, 'object_dtype_mapping'):
+            # Mypy does not process the has attr. This dict is defined below
+            for key, dtype in self.object_dtype_mapping.items():  # type: ignore[has-type]
+                if 'int' in dtype.name:
+                    # In the case train data was interpreted as int
+                    # and test data was interpreted as float, because of 0.0
+                    # for example, honor training data
+                    X[key] = X[key].applymap(np.int64)
+                else:
+                    try:
+                        X[key] = X[key].astype(dtype.name)
+                    except Exception as e:
+                        # Try inference if possible
+                        self.logger.warning(f"Tried to cast column {key} to {dtype} caused {e}")
+                        pass
+        else:
+            X = X.infer_objects()
+            for column in X.columns:
+                if not is_numeric_dtype(X[column]):
+                    X[column] = X[column].astype('category')
+            self.object_dtype_mapping = {column: X[column].dtype for column in X.columns}
+        self.logger.debug(f"Infer Objects: {self.object_dtype_mapping}")
+        return X
+
+    def impute_nan_in_categories(self, X: pd.DataFrame) -> pd.DataFrame:
+        """
+        impute missing values before encoding,
+        remove once sklearn natively supports
+        it in ordinal encoding. Sklearn issue:
+        "https://github.com/scikit-learn/scikit-learn/issues/17123)"
+
+        Arguments:
+            X (pd.DataFrame):
+                data to be interpreted.
+
+        Returns:
+            pd.DataFrame
+        """
+
+        # To be on the safe side, map always to the same missing
+        # value per column
+        if not hasattr(self, 'dict_nancol_to_missing'):
+            self.dict_missing_value_per_col: typing.Dict[str, typing.Any] = {}
+
+        # First make sure that we do not alter the type of the column which cause:
+        # TypeError: '<' not supported between instances of 'int' and 'str'
+        # in the encoding
+        for column in self.enc_columns:
+            if X[column].isna().any():
+                if column not in self.dict_missing_value_per_col:
+                    try:
+                        float(X[column].dropna().values[0])
+                        can_cast_as_number = True
+                    except Exception:
+                        can_cast_as_number = False
+                    if can_cast_as_number:
+                        # In this case, we expect to have a number as category
+                        # it might be string, but its value represent a number
+                        missing_value: typing.Union[str, int] = '-1' if isinstance(X[column].dropna().values[0],
+                                                                                   str) else -1
+                    else:
+                        missing_value = 'Missing!'
+
+                    # Make sure this missing value is not seen before
+                    # Do this check for categorical columns
+                    # else modify the value
+                    if hasattr(X[column], 'cat'):
+                        while missing_value in X[column].cat.categories:
+                            if isinstance(missing_value, str):
+                                missing_value += '0'
+                            else:
+                                missing_value += missing_value
+                    self.dict_missing_value_per_col[column] = missing_value
+
+                # Convert the frame in place
+                X[column].cat.add_categories([self.dict_missing_value_per_col[column]],
+                                             inplace=True)
+                X.fillna({column: self.dict_missing_value_per_col[column]}, inplace=True)
+        return X
diff --git a/autoPyTorch/evaluation/abstract_evaluator.py b/autoPyTorch/evaluation/abstract_evaluator.py
@@ -134,6 +134,12 @@ def estimator_supports_iterative_fit(self) -> bool:  # pylint: disable=R0201
     def get_additional_run_info(self) -> None:  # pylint: disable=R0201
         return None
 
+    def get_pipeline_representation(self) -> Dict[str, str]:
+        return {
+            'Preprocessing': 'None',
+            'Estimator': 'Dummy',
+        }
+
     @staticmethod
     def get_default_pipeline_options() -> Dict[str, Any]:
         return {'budget_type': 'epochs',
diff --git a/test/test_api/test_api.py b/test/test_api/test_api.py
@@ -1,6 +1,7 @@
 import os
 import pickle
 import sys
+import unittest
 
 import numpy as np
 
@@ -21,6 +22,7 @@
     CrossValTypes,
     HoldoutValTypes,
 )
+from autoPyTorch.optimizer.smbo import AutoMLSMBO
 
 
 # Fixtures
@@ -344,3 +346,45 @@ def test_tabular_regression(openml_name, resampling_strategy, backend):
         with open(dump_file, 'rb') as f:
             restored_estimator = pickle.load(f)
         restored_estimator.predict(X_test)
+
+
+@pytest.mark.parametrize('openml_id', (
+    1590,  # Adult to test NaN in categorical columns
+))
+def test_tabular_input_support(openml_id, backend):
+    """
+    Make sure we can process inputs with NaN in categorical and Object columns
+    when the later is possible
+    """
+
+    # Get the data and check that contents of data-manager make sense
+    X, y = sklearn.datasets.fetch_openml(
+        data_id=int(openml_id),
+        return_X_y=True, as_frame=True
+    )
+
+    # Make sure we are robust against objects
+    X[X.columns[0]] = X[X.columns[0]].astype(object)
+
+    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
+        X, y, random_state=1)
+    # Search for a good configuration
+    estimator = TabularClassificationTask(
+        backend=backend,
+        resampling_strategy=HoldoutValTypes.holdout_validation,
+        ensemble_size=0,
+    )
+
+    estimator._do_dummy_prediction = unittest.mock.MagicMock()
+
+    with unittest.mock.patch.object(AutoMLSMBO, 'run_smbo') as AutoMLSMBOMock:
+        AutoMLSMBOMock.return_value = ({}, {}, 'epochs')
+        estimator.search(
+            X_train=X_train, y_train=y_train,
+            X_test=X_test, y_test=y_test,
+            optimize_metric='accuracy',
+            total_walltime_limit=150,
+            func_eval_time_limit=50,
+            traditional_per_total_budget=0,
+            load_models=False,
+        )
diff --git a/test/test_data/test_feature_validator.py b/test/test_data/test_feature_validator.py
@@ -236,7 +236,7 @@ def test_featurevalidator_categorical_nan(input_data_featuretest):
     validator.fit(input_data_featuretest)
     transformed_X = validator.transform(input_data_featuretest)
     assert any(pd.isna(input_data_featuretest))
-    assert any((-1 in categories) or ('-1' in categories) for categories in
+    assert any((-1 in categories) or ('-1' in categories) or ('Missing!' in categories) for categories in
                validator.encoder.named_transformers_['encoder'].categories_)
     assert np.shape(input_data_featuretest) == np.shape(transformed_X)
     assert np.issubdtype(transformed_X.dtype, np.number)
@@ -328,10 +328,6 @@ def test_features_unsupported_calls_are_raised():
         validator.fit(
             pd.DataFrame({'datetime': [pd.Timestamp('20180310')]})
         )
-    with pytest.raises(ValueError, match="has invalid type object"):
-        validator.fit(
-            pd.DataFrame({'string': [TabularFeatureValidator()]})
-        )
     with pytest.raises(ValueError, match=r"AutoPyTorch only supports.*yet, the provided input"):
         validator.fit({'input1': 1, 'input2': 2})
     with pytest.raises(ValueError, match=r"has unsupported dtype string"):