automl
diff --git a/‎autosklearn/automl.py
Lines changed: 143 additions & 8 deletions b/‎autosklearn/automl.py
Lines changed: 143 additions & 8 deletions
diff --git a/‎autosklearn/estimators.py
Lines changed: 63 additions & 2 deletions b/‎autosklearn/estimators.py
Lines changed: 63 additions & 2 deletions
diff --git a/‎examples/20_basic/example_classification.py
Lines changed: 3 additions & 1 deletion b/‎examples/20_basic/example_classification.py
Lines changed: 3 additions & 1 deletion
diff --git a/‎examples/20_basic/example_multilabel_classification.py
Lines changed: 2 additions & 1 deletion b/‎examples/20_basic/example_multilabel_classification.py
Lines changed: 2 additions & 1 deletion
diff --git a/‎examples/20_basic/example_multioutput_regression.py
Lines changed: 2 additions & 1 deletion b/‎examples/20_basic/example_multioutput_regression.py
Lines changed: 2 additions & 1 deletion
diff --git a/‎examples/20_basic/example_regression.py
Lines changed: 3 additions & 1 deletion b/‎examples/20_basic/example_regression.py
Lines changed: 3 additions & 1 deletion
diff --git a/‎examples/40_advanced/example_get_pipeline_components.py
Lines changed: 13 additions & 4 deletions b/‎examples/40_advanced/example_get_pipeline_components.py
Lines changed: 13 additions & 4 deletions
diff --git a/‎examples/40_advanced/example_interpretable_models.py
Lines changed: 3 additions & 1 deletion b/‎examples/40_advanced/example_interpretable_models.py
Lines changed: 3 additions & 1 deletion
diff --git a/‎examples/60_search/example_random_search.py
Lines changed: 3 additions & 2 deletions b/‎examples/60_search/example_random_search.py
Lines changed: 3 additions & 2 deletions
diff --git a/‎examples/60_search/example_sequential.py
Lines changed: 2 additions & 1 deletion b/‎examples/60_search/example_sequential.py
Lines changed: 2 additions & 1 deletion
@@ -1836,16 +1836,151 @@ def get_models_with_weights(self):
 
         return self.ensemble_.get_models_with_weights(self.models_)
 
-    def show_models(self):
-        models_with_weights = self.get_models_with_weights()
+    def show_models(self) -> Dict[int, Any]:
+        """ Returns a dictionary containing dictionaries of ensemble models.
 
-        with io.StringIO() as sio:
-            sio.write("[")
-            for weight, model in models_with_weights:
-                sio.write("(%f, %s),\n" % (weight, model))
-            sio.write("]")
+        Each model in the ensemble can be accessed by giving its ``model_id`` as key.
 
-            return sio.getvalue()
+        A model dictionary contains the following:
+
+        * ``"model_id"`` - The id given to a model by ``autosklearn``.
+        * ``"rank"`` - The rank of the model based on its ``"cost"`` (1 is the lowest cost).
+        * ``"cost"`` - The loss of the model on the validation set.
+        * ``"ensemble_weight"`` - The weight given to the model in the ensemble.
+        * ``"voting_model"`` - The ``cv_voting_ensemble`` model (for 'cv' resampling).
+        * ``"estimators"`` - List of models (dicts) in ``cv_voting_ensemble`` (for 'cv' resampling).
+        * ``"data_preprocessor"`` - The preprocessor used on the data.
+        * ``"balancing"`` - The balancing used on the data (for classification).
+        * ``"feature_preprocessor"`` - The preprocessor for feature types.
+        * ``"classifier"`` or ``"regressor"`` - The autosklearn wrapped classifier or regressor.
+        * ``"sklearn_classifier"`` or ``"sklearn_regressor"`` - The sklearn classifier or regressor.
+
+        **Example**
+
+        .. code-block:: python
+
+            import sklearn.datasets
+            import sklearn.metrics
+            import autosklearn.regression
+
+            X, y = sklearn.datasets.load_diabetes(return_X_y=True)
+
+            automl = autosklearn.regression.AutoSklearnRegressor(
+                time_left_for_this_task=120
+                )
+            automl.fit(X, y, dataset_name='diabetes')
+
+            ensemble_dict = automl.show_models()
+            print(ensemble_dict)
+
+        Output:
+
+        .. code-block:: text
+
+            {
+                25: {'model_id': 25,
+                     'rank': 1,
+                     'cost': 0.43667876507897496,
+                     'ensemble_weight': 0.38,
+                     'data_preprocessor': <autosklearn.pipeline.components.data_preprocessing....>,
+                     'feature_preprocessor': <autosklearn.pipeline.components....>,
+                     'regressor': <autosklearn.pipeline.components.regression....>,
+                     'sklearn_regressor': SGDRegressor(alpha=0.0006517033225329654,...)
+                    },
+                6: {'model_id': 6,
+                    'rank': 2,
+                    'cost': 0.4550418898836528,
+                    'ensemble_weight': 0.3,
+                    'data_preprocessor': <autosklearn.pipeline.components.data_preprocessing....>,
+                    'feature_preprocessor': <autosklearn.pipeline.components....>,
+                    'regressor': <autosklearn.pipeline.components.regression....>,
+                    'sklearn_regressor': ARDRegression(alpha_1=0.0003701926442639788,...)
+                    }...
+            }
+
+        Returns
+        -------
+        Dict[int, Any] : dictionary of length = number of models in the ensemble
+            A dictionary of models in the ensemble, where ``model_id`` is the key.
+
+        Raises
+        ------
+        RuntimeError
+            If no run in the run history carries a ``num_run`` entry (no model found).
+
+        """
+        ensemble_dict = {}
+
+        table_dict = {}
+        for rval in self.runhistory_.data.values():
+            if rval.additional_info and 'num_run' in rval.additional_info:
+                model_id = rval.additional_info['num_run']
+                table_dict[model_id] = {
+                        'model_id': model_id,
+                        'cost': rval.cost
+                        }
+
+        # Checking if the dictionary is empty
+        if not table_dict:
+            raise RuntimeError('No model found. Try increasing \'time_left_for_this_task\'.')
+
+        for i, weight in enumerate(self.ensemble_.weights_):
+            (_, model_id, _) = self.ensemble_.identifiers_[i]
+            table_dict[model_id]['ensemble_weight'] = weight
+
+        table = pd.DataFrame.from_dict(table_dict, orient='index')
+        table.sort_values(by='cost', inplace=True)
+
+        # Checking which resampling strategy is chosen and selecting the appropriate models
+        is_cv = (self._resampling_strategy == "cv")
+        models = self.cv_models_ if is_cv else self.models_
+
+        # Rank by ascending cost: the dict order of ``models`` is not guaranteed to follow cost
+        by_cost = sorted(models.items(), key=lambda item: table.loc[item[0][1], 'cost'])
+        for rank, ((_, model_id, _), model) in enumerate(by_cost, start=1):
+            model_dict = {}  # Declaring model dictionary
+            # Inserting model_id, rank, cost and ensemble weight
+            model_dict['model_id'] = table.loc[model_id]['model_id'].astype(int)
+            model_dict['rank'] = rank
+            model_dict['cost'] = table.loc[model_id]['cost']
+            model_dict['ensemble_weight'] = table.loc[model_id]['ensemble_weight']
+
+            # The steps in the models pipeline are as follows:
+            # 'data_preprocessor': DataPreprocessor,
+            # 'balancing': Balancing,
+            # 'feature_preprocessor': FeaturePreprocessorChoice,
+            # 'classifier'/'regressor': ClassifierChoice/RegressorChoice (autosklearn wrapped model)
+
+            # For 'cv' (cross validation) strategy
+            if is_cv:
+                # Voting model created by cross validation
+                cv_voting_ensemble = model
+                model_dict['voting_model'] = cv_voting_ensemble
+
+                # List of models, each trained on one cv fold
+                cv_models = []
+                for cv_model in cv_voting_ensemble.estimators_:
+                    estimator = dict(cv_model.steps)
+
+                    # Adding sklearn model to the model dictionary
+                    model_type, autosklearn_wrapped_model = cv_model.steps[-1]
+                    estimator[f'sklearn_{model_type}'] = autosklearn_wrapped_model.choice.estimator
+                    cv_models.append(estimator)
+                model_dict['estimators'] = cv_models
+
+            # For any other strategy
+            else:
+                steps = dict(model.steps)
+                model_dict.update(steps)
+
+                # Adding sklearn model to the model dictionary
+                model_type, autosklearn_wrapped_model = model.steps[-1]
+                model_dict[f'sklearn_{model_type}'] = autosklearn_wrapped_model.choice.estimator
+
+            # Inserting model_dict in the ensemble dictionary
+            ensemble_dict[model_id] = model_dict
+
+        return ensemble_dict
 
     def _create_search_space(
         self,
 
@@ -537,13 +537,74 @@ def score(self, X, y):
         return self.automl_.score(X, y)
 
     def show_models(self):
-        """Return a representation of the final ensemble found by auto-sklearn.
+        """ Returns a dictionary containing dictionaries of ensemble models.
+
+        Each model in the ensemble can be accessed by giving its ``model_id`` as key.
+
+        A model dictionary contains the following:
+
+        * ``"model_id"`` - The id given to a model by ``autosklearn``.
+        * ``"rank"`` - The rank of the model based on its ``"cost"``.
+        * ``"cost"`` - The loss of the model on the validation set.
+        * ``"ensemble_weight"`` - The weight given to the model in the ensemble.
+        * ``"voting_model"`` - The ``cv_voting_ensemble`` model (for 'cv' resampling).
+        * ``"estimators"`` - List of models (dicts) in ``cv_voting_ensemble`` (for 'cv' resampling).
+        * ``"data_preprocessor"`` - The preprocessor used on the data.
+        * ``"balancing"`` - The balancing used on the data (for classification).
+        * ``"feature_preprocessor"`` - The preprocessor for feature types.
+        * ``"classifier"`` or ``"regressor"`` - The autosklearn wrapped classifier or regressor.
+        * ``"sklearn_classifier"`` or ``"sklearn_regressor"`` - The sklearn classifier or regressor.
+
+        **Example**
+
+        .. code-block:: python
+
+            import sklearn.datasets
+            import sklearn.metrics
+            import autosklearn.regression
+
+            X, y = sklearn.datasets.load_diabetes(return_X_y=True)
+
+            automl = autosklearn.regression.AutoSklearnRegressor(
+                time_left_for_this_task=120
+                )
+            automl.fit(X, y, dataset_name='diabetes')
+
+            ensemble_dict = automl.show_models()
+            print(ensemble_dict)
+
+        Output:
+
+        .. code-block:: text
+
+            {
+                25: {'model_id': 25,
+                     'rank': 1,
+                     'cost': 0.43667876507897496,
+                     'ensemble_weight': 0.38,
+                     'data_preprocessor': <autosklearn.pipeline.components.data_preprocessing....>,
+                     'feature_preprocessor': <autosklearn.pipeline.components....>,
+                     'regressor': <autosklearn.pipeline.components.regression....>,
+                     'sklearn_regressor': SGDRegressor(alpha=0.0006517033225329654,...)
+                    },
+                6: {'model_id': 6,
+                    'rank': 2,
+                    'cost': 0.4550418898836528,
+                    'ensemble_weight': 0.3,
+                    'data_preprocessor': <autosklearn.pipeline.components.data_preprocessing....>,
+                    'feature_preprocessor': <autosklearn.pipeline.components....>,
+                    'regressor': <autosklearn.pipeline.components.regression....>,
+                    'sklearn_regressor': ARDRegression(alpha_1=0.0003701926442639788,...)
+                    }...
+            }
 
         Returns
         -------
-        str
+        Dict[int, Any] : dictionary of length = number of models in the ensemble
+            A dictionary of models in the ensemble, where ``model_id`` is the key.
 
         """
+
         return self.automl_.show_models()
 
     def get_models_with_weights(self):
 
@@ -7,6 +7,8 @@
 The following example shows how to fit a simple classification model with
 *auto-sklearn*.
 """
+from pprint import pprint
+
 import sklearn.datasets
 import sklearn.metrics
 
@@ -42,7 +44,7 @@
 # Print the final ensemble constructed by auto-sklearn
 # ====================================================
 
-print(automl.show_models())
+pprint(automl.show_models(), indent=4)
 
 ###########################################################################
 # Get the Score of the final ensemble
 
@@ -8,6 +8,7 @@
 `here <https://scikit-learn.org/stable/modules/multiclass.html>`_.
 """
 import numpy as np
+from pprint import pprint
 
 import sklearn.datasets
 import sklearn.metrics
@@ -65,7 +66,7 @@
 # Print the final ensemble constructed by auto-sklearn
 # ====================================================
 
-print(automl.show_models())
+pprint(automl.show_models(), indent=4)
 
 ############################################################################
 # Print statistics about the auto-sklearn run
 
@@ -8,6 +8,7 @@
 *auto-sklearn*.
 """
 import numpy as numpy
+from pprint import pprint
 
 from sklearn.datasets import make_regression
 from sklearn.metrics import r2_score
@@ -46,7 +47,7 @@
 # Print the final ensemble constructed by auto-sklearn
 # ====================================================
 
-print(automl.show_models())
+pprint(automl.show_models(), indent=4)
 
 ###########################################################################
 # Get the Score of the final ensemble
 
@@ -7,6 +7,8 @@
 The following example shows how to fit a simple regression model with
 *auto-sklearn*.
 """
+from pprint import pprint
+
 import sklearn.datasets
 import sklearn.metrics
 
@@ -43,7 +45,7 @@
 # Print the final ensemble constructed by auto-sklearn
 # ====================================================
 
-print(automl.show_models())
+pprint(automl.show_models(), indent=4)
 
 #####################################
 # Get the Score of the final ensemble
 
@@ -14,6 +14,8 @@
 the sklearn models. This example illustrates how to interact
 with the sklearn components directly, in this case a PCA preprocessor.
 """
+from pprint import pprint
+
 import sklearn.datasets
 import sklearn.metrics
 
@@ -62,10 +64,17 @@
 # `Ensemble Selection <https://www.cs.cornell.edu/~alexn/papers/shotgun.icml04.revised.rev2.pdf>`_
 # to construct ensembles in a post-hoc fashion. The ensemble is a linear
 # weighting of all models constructed during the hyperparameter optimization.
-# This prints the final ensemble. It is a list of tuples, each tuple being
-# the model weight in the ensemble and the model itself.
-
-print(automl.show_models())
+# This prints the final ensemble. It is a dictionary where ``model_id`` of
+# each model is a key, and value is a dictionary containing information
+# of that model. A model's dict contains its ``'model_id'``, ``'rank'``,
+# ``'cost'``, ``'ensemble_weight'``, and the model itself. The model is
+# given by the ``'data_preprocessor'``, ``'feature_preprocessor'``,
+# ``'regressor'/'classifier'`` and ``'sklearn_regressor'/'sklearn_classifier'``
+# entries. For the ``'cv'`` resampling strategy, these entries are instead
+# stored per cv model in the ``'estimators'`` list of the dict, alongside
+# the ``'voting_model'``.
+
+pprint(automl.show_models(), indent=4)
 
 ###########################################################################
 # Report statistics about the search
 
@@ -7,6 +7,8 @@
 The following example shows how to inspect the models which *auto-sklearn*
 optimizes over and how to restrict them to an interpretable subset.
 """
+from pprint import pprint
+
 import autosklearn.classification
 import sklearn.datasets
 import sklearn.metrics
@@ -70,7 +72,7 @@
 # Print the final ensemble constructed by auto-sklearn
 # ====================================================
 
-print(automl.show_models())
+pprint(automl.show_models(), indent=4)
 
 ###########################################################################
 # Get the Score of the final ensemble
 
@@ -12,6 +12,7 @@
 as yet another alternative optimizatino strategy.
 Both examples are intended to show how the optimization strategy in *auto-sklearn* can be adapted.
 """  # noqa (links are too long)
+from pprint import pprint
 
 import sklearn.model_selection
 import sklearn.datasets
@@ -75,7 +76,7 @@ def get_roar_object_callback(
 print('#' * 80)
 print('Results for ROAR.')
 # Print the final ensemble constructed by auto-sklearn via ROAR.
-print(automl.show_models())
+pprint(automl.show_models(), indent=4)
 predictions = automl.predict(X_test)
 # Print statistics about the auto-sklearn run such as number of
 # iterations, number of models failed with a time out.
@@ -129,7 +130,7 @@ def get_random_search_object_callback(
 print('Results for random search.')
 
 # Print the final ensemble constructed by auto-sklearn via random search.
-print(automl.show_models())
+pprint(automl.show_models(), indent=4)
 
 # Print statistics about the auto-sklearn run such as number of
 # iterations, number of models failed with a time out.
 
@@ -8,6 +8,7 @@
 sequentially. The example below shows how to first fit the models and build the
 ensembles afterwards.
 """
+from pprint import pprint
 
 import sklearn.model_selection
 import sklearn.datasets
@@ -48,7 +49,7 @@
 # Print the final ensemble constructed by auto-sklearn
 # ====================================================
 
-print(automl.show_models())
+pprint(automl.show_models(), indent=4)
 
 ############################################################################
 # Get the Score of the final ensemble