Fix the ensemble_size == 0 error in automl.py (#1369)

bkpcoding · eddiebergman · eddiebergman · commit b90d61d69bc6 · 2022-08-18T20:08:48.000+02:00
* Fix the ensemble_size == 0 error in the fit_ensemble and show_models functions by raising a ValueError in the former and by issuing a warning and returning an empty dictionary in the latter

* Update automl.py

* Two tests for ensemble_size == 0 cases

Added two tests to check that automl.fit_ensemble() raises an error when ensemble_size == 0 and that show_models() returns an empty dictionary when ensemble_size == 0

* Update automl.py

* Update test_automl.py

Test that the show_models() function raises an error if the models are not fitted.

* Update automl.py

Add a method __sklearn_is_fitted__() which returns the boolean value of the self.fitted attribute, and add the check for model fitting to the show_models() function.

* Update autosklearn/automl.py

* Formatting changes to clear all the pre-commit tests

Co-authored-by: Eddie Bergman <eddiebergmanhs@gmail.com>
diff --git a/autosklearn/automl.py b/autosklearn/automl.py
@@ -13,6 +13,7 @@
 import time
 import unittest.mock
 import uuid
+import warnings
 
 import dask
 import dask.distributed
@@ -302,6 +303,7 @@ def __init__(
 
         # The ensemble performance history through time
         self.ensemble_performance_history = []
+        self.fitted = False
 
         # Single core, local runs should use fork
         # to prevent the __main__ requirements in
@@ -348,7 +350,7 @@ def _create_dask_client(self):
                 processes=False,
                 threads_per_worker=1,
                 # We use the temporal directory to save the
-                # dask workers, because deleting workers
+                # dask workers, because deleting workers takes
                 # more time than deleting backend directories
                 # This prevent an error saying that the worker
                 # file was deleted, so the client could not close
@@ -562,7 +564,7 @@ def fit(
         #   "multiclass" be mean either REGRESSION or MULTICLASS_CLASSIFICATION,
         #   and so this is where the subclasses are used to determine which.
         #   However, this could also be deduced from the `is_classification`
-        #   paramaeter.
+        #   parameter.
         #
         #   In the future, there is little need for the subclasses of `AutoML`
         #   and no need for the `task` parameter. The extra functionality
@@ -1068,9 +1070,13 @@ def fit(
             self._logger.info("Finished loading models...")
 
         self._fit_cleanup()
+        self.fitted = True
 
         return self
 
+    def __sklearn_is_fitted__(self) -> bool:
+        return self.fitted
+
     def _fit_cleanup(self):
         self._logger.info("Closing the dask infrastructure")
         self._close_dask_client()
@@ -1481,6 +1487,10 @@ def fit_ensemble(
         ensemble_nbest=None,
         ensemble_size=None,
     ):
+        # ensemble_size must be strictly positive; 0 and negative values are rejected
+        if not ensemble_size > 0:
+            raise ValueError("ensemble_size must be greater than 0 for fit_ensemble")
+
         # AutoSklearn does not handle sparse y for now
         y = convert_if_sparse(y)
 
@@ -1971,9 +1981,21 @@ def show_models(self) -> Dict[int, Any]:
         -------
         Dict(int, Any) : dictionary of length = number of models in the ensemble
             A dictionary of models in the ensemble, where ``model_id`` is the key.
-
         """  # noqa: E501
         ensemble_dict = {}
+        # Raise a RuntimeError if autosklearn has not been fitted yet
+        if not self.__sklearn_is_fitted__():
+            raise RuntimeError("AutoSklearn has not been fitted")
+
+        # check for ensemble_size == 0
+        if self._ensemble_size == 0:
+            warnings.warn("No models in the ensemble. Kindly check the ensemble size.")
+            return ensemble_dict
+
+        # check for condition when ensemble_size > 0 but there is no ensemble to load
+        if self.ensemble_ is None:
+            warnings.warn("No ensemble found. Returning empty dictionary.")
+            return ensemble_dict
 
         def has_key(rv, key):
             return rv.additional_info and key in rv.additional_info
diff --git a/test/test_automl/test_automl.py b/test/test_automl/test_automl.py
@@ -88,6 +88,48 @@ def test_fit(dask_client):
     del automl
 
 
+def test_ensemble_size_zero():
+    """Test if automl.fit_ensemble raises error when ensemble_size == 0"""
+    X_train, Y_train, X_test, Y_test = putil.get_dataset("iris")
+    automl = autosklearn.automl.AutoML(
+        seed=0,
+        time_left_for_this_task=30,
+        per_run_time_limit=5,
+        metric=accuracy,
+        ensemble_size=0,
+    )
+    automl.fit(X_train, Y_train, task=MULTICLASS_CLASSIFICATION)
+    with pytest.raises(ValueError):
+        automl.fit_ensemble(Y_test, ensemble_size=0)
+
+
+def test_empty_dict_in_show_models():
+    """Test if show_models() returns empty dictionary when ensemble_size == 0"""
+    X_train, Y_train, X_test, Y_test = putil.get_dataset("iris")
+    automl = autosklearn.automl.AutoMLClassifier(
+        seed=0,
+        time_left_for_this_task=30,
+        per_run_time_limit=5,
+        metric=accuracy,
+        ensemble_size=0,
+    )
+    automl.fit(X_train, Y_train)
+    assert automl.show_models() == {}
+
+
+def test_fitted_models_in_show_models():
+    X_train, Y_train, X_test, Y_test = putil.get_dataset("iris")
+    automl = autosklearn.automl.AutoMLClassifier(
+        seed=0,
+        time_left_for_this_task=30,
+        per_run_time_limit=5,
+        metric=accuracy,
+        ensemble_size=0,
+    )
+    with pytest.raises(RuntimeError, match="AutoSklearn has not been fitted"):
+        automl.show_models()
+
+
 def test_fit_roar(dask_client_single_worker):
     def get_roar_object_callback(
         scenario_dict, seed, ta, ta_kwargs, dask_client, n_jobs, **kwargs