Commit 93ce1eb

[ADD] Coverage calculation
1 parent 4493270 commit 93ce1eb

9 files changed: +269 -12 lines changed


.codecov.yml

Lines changed: 42 additions & 0 deletions
@@ -0,0 +1,42 @@
+#see https://github.com/codecov/support/wiki/Codecov-Yaml
+codecov:
+  notify:
+    require_ci_to_pass: yes
+
+coverage:
+  precision: 2 # 2 = xx.xx%, 0 = xx%
+  round: nearest # how coverage is rounded: down/up/nearest
+  range: 10...90 # custom range of coverage colors from red -> yellow -> green
+  status:
+    # https://codecov.readme.io/v1.0/docs/commit-status
+    project:
+      default:
+        against: auto
+        target: 70% # specify the target coverage for each commit status
+        threshold: 50% # allow this little decrease on project
+        # https://github.com/codecov/support/wiki/Filtering-Branches
+        # branches: master
+        if_ci_failed: error
+    # https://github.com/codecov/support/wiki/Patch-Status
+    patch:
+      default:
+        against: auto
+        target: 30% # specify the target "X%" coverage to hit
+        threshold: 50% # allow this much decrease on patch
+    changes: false
+
+parsers:
+  gcov:
+    branch_detection:
+      conditional: true
+      loop: true
+      macro: false
+      method: false
+  javascript:
+    enable_partials: false
+
+comment:
+  layout: header, diff
+  require_changes: false
+  behavior: default # update if exists else create new
+  branches: *
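
For orientation, a rough sketch of the project-status rule configured above, assuming Codecov's documented behaviour (green when head coverage meets `target`, with `threshold` as the tolerated drop against the `against: auto` base); the helper below is illustrative, not Codecov's implementation:

# Illustrative only: approximates the project-status rule from .codecov.yml.
# Assumption: a status is green when coverage meets the 70% target, or when
# it drops no more than the 50-point threshold relative to the base commit.
def project_status_green(head_cov: float, base_cov: float,
                         target: float = 70.0, threshold: float = 50.0) -> bool:
    return head_cov >= target or (base_cov - head_cov) <= threshold

# 72% clears the target outright; 40% after an 85% base still passes,
# since the 45-point drop stays within the 50-point threshold.
assert project_status_green(72.0, base_cov=70.0)
assert project_status_green(40.0, base_cov=85.0)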

.github/workflows/pytest.yml

Lines changed: 4 additions & 1 deletion
@@ -9,7 +9,10 @@ jobs:
     strategy:
       matrix:
         python-version: [3.6, 3.7, 3.8]
-      fail-fast: false
+        include:
+          - python-version: 3.8
+            code-cov: true
+      fail-fast: false
       max-parallel: 2

     steps:

autoPyTorch/api/base_task.py

Lines changed: 5 additions & 8 deletions
@@ -34,6 +34,7 @@
     STRING_TO_OUTPUT_TYPES,
     STRING_TO_TASK_TYPES,
 )
+from autoPyTorch.data.base_validator import BaseInputValidator
 from autoPyTorch.datasets.base_dataset import BaseDataset
 from autoPyTorch.datasets.resampling_strategy import CrossValTypes, HoldoutValTypes
 from autoPyTorch.ensemble.ensemble_builder import EnsembleBuilderManager
@@ -189,6 +190,8 @@ def __init__(

         self._dask_client = None

+        self.InputValidator: Optional[BaseInputValidator] = None
+
         self.search_space_updates = search_space_updates
         if search_space_updates is not None:
             if not isinstance(self.search_space_updates,
@@ -253,12 +256,6 @@ def get_pipeline_options(self) -> dict:
         """
         return self.pipeline_options

-    # def set_search_space(self, search_space: ConfigurationSpace) -> None:
-    #     """
-    #     Update the search space.
-    #     """
-    #     raise NotImplementedError
-    #
     def get_search_space(self, dataset: BaseDataset = None) -> ConfigurationSpace:
         """
         Returns the current search space as ConfigurationSpace object.
@@ -272,8 +269,8 @@ def get_search_space(self, dataset: BaseDataset = None) -> ConfigurationSpace:
                 include=self.include_components,
                 exclude=self.exclude_components,
                 search_space_updates=self.search_space_updates)
-        raise Exception("No search space initialised and no dataset passed. "
-                        "Can't create default search space without the dataset")
+        raise ValueError("No search space initialised and no dataset passed. "
+                         "Can't create default search space without the dataset")

     def _get_logger(self, name: str) -> PicklableClientLogger:
         """

autoPyTorch/datasets/base_dataset.py

Lines changed: 4 additions & 1 deletion
@@ -129,7 +129,10 @@ def __init__(
         if len(self.train_tensors) == 2 and self.train_tensors[1] is not None:
             self.output_type: str = type_of_target(self.train_tensors[1])

-            if STRING_TO_OUTPUT_TYPES[self.output_type] in CLASSIFICATION_OUTPUTS:
+            if (
+                self.output_type in STRING_TO_OUTPUT_TYPES
+                and STRING_TO_OUTPUT_TYPES[self.output_type] in CLASSIFICATION_OUTPUTS
+            ):
                 self.output_shape = len(np.unique(self.train_tensors[1]))
             else:
                 self.output_shape = self.train_tensors[1].shape[-1] if self.train_tensors[1].ndim > 1 else 1
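
The added membership check matters because `type_of_target` can return strings that may not appear in the mapping at all, and a plain dict lookup then raises `KeyError`. A small sketch with an illustrative mapping (not autoPyTorch's actual constants):

# Illustrative mapping only -- not the real autoPyTorch constants.
import numpy as np
from sklearn.utils.multiclass import type_of_target

STRING_TO_OUTPUT_TYPES = {'binary': 1, 'multiclass': 2}
CLASSIFICATION_OUTPUTS = {1, 2}

for y in (np.array([0, 1, 1, 0]), np.array([0.5, 1.7, 3.2])):
    output_type = type_of_target(y)  # 'binary', then 'continuous'
    if (
        output_type in STRING_TO_OUTPUT_TYPES
        and STRING_TO_OUTPUT_TYPES[output_type] in CLASSIFICATION_OUTPUTS
    ):
        print(output_type, '-> classification: output_shape =', len(np.unique(y)))
    else:
        # unmapped types no longer raise KeyError; they take the shape branch
        print(output_type, '-> shape-based fallback')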

autoPyTorch/datasets/resampling_strategy.py

Lines changed: 4 additions & 1 deletion
@@ -162,7 +162,10 @@ def stratified_k_fold_cross_validation(random_state: np.random.RandomState,
                                            indices: np.ndarray,
                                            **kwargs: Any
                                            ) -> List[Tuple[np.ndarray, np.ndarray]]:
-        cv = StratifiedKFold(n_splits=num_splits, random_state=random_state)
+
+        shuffle = kwargs.get('shuffle', True)
+        cv = StratifiedKFold(n_splits=num_splits, shuffle=shuffle,
+                             random_state=random_state if not shuffle else None)
         splits = list(cv.split(indices, kwargs["stratify"]))
         return splits

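A hedged usage sketch of the new `shuffle` keyword, mirroring the positional `(random_state, num_splits, indices)` calling convention used by the resampling tests added in this commit; `shuffle` arrives via `**kwargs` and defaults to `True`:

import numpy as np

from autoPyTorch.datasets.resampling_strategy import CrossValFuncs

X = np.arange(20)
y = np.array([0, 1] * 10)

# With the default shuffle=True, random_state is forwarded as None (per the
# diff above), so folds are shuffled; stratification still keeps both
# classes represented in every test fold.
splits = CrossValFuncs().stratified_k_fold_cross_validation(0, 10, X, stratify=y)
assert len(splits) == 10
assert all(0 in y[test] for _, test in splits)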

test/test_api/test_api.py

Lines changed: 90 additions & 1 deletion
@@ -14,6 +14,7 @@

 import sklearn
 import sklearn.datasets
+from sklearn.base import BaseEstimator
 from sklearn.base import clone
 from sklearn.ensemble import VotingClassifier, VotingRegressor

@@ -26,6 +27,7 @@
     HoldoutValTypes,
 )
 from autoPyTorch.optimizer.smbo import AutoMLSMBO
+from autoPyTorch.pipeline.components.setup.traditional_ml.classifier_models import _classifiers
 from autoPyTorch.pipeline.components.training.metrics.metrics import accuracy


@@ -183,9 +185,12 @@ def test_tabular_classification(openml_id, resampling_strategy, backend, resampl
     assert len(estimator.ensemble_.identifiers_) == len(estimator.ensemble_.weights_)

     y_pred = estimator.predict(X_test)
-
     assert np.shape(y_pred)[0] == np.shape(X_test)[0]

+    # Make sure that predict proba has the expected shape
+    probabilites = estimator.predict_proba(X_test)
+    assert np.shape(probabilites) == (np.shape(X_test)[0], 2)
+
     score = estimator.score(y_pred, y_test)
     assert 'accuracy' in score

@@ -203,6 +208,9 @@ def test_tabular_classification(openml_id, resampling_strategy, backend, resampl
         restored_estimator = pickle.load(f)
     restored_estimator.predict(X_test)

+    # Test refit on dummy data
+    estimator.refit(dataset=backend.load_datamanager())
+

 @pytest.mark.parametrize('openml_name', ("boston", ))
 @unittest.mock.patch('autoPyTorch.evaluation.train_evaluator.eval_function',
@@ -439,6 +447,12 @@ def test_do_dummy_prediction(dask_client, fit_dictionary_tabular):
     estimator._disable_file_output = []
     estimator._all_supported_metrics = False

+    original_memory_limit = estimator._memory_limit
+    estimator._memory_limit = 500
+    with pytest.raises(ValueError, match=r".*Dummy prediction failed with run state.*"):
+        estimator._do_dummy_prediction()
+
+    estimator._memory_limit = original_memory_limit
     estimator._do_dummy_prediction()

     # Ensure that the dummy predictions are not in the current working
@@ -464,3 +478,78 @@ def test_do_dummy_prediction(dask_client, fit_dictionary_tabular):
     estimator._clean_logger()

     del estimator
+
+
+# TODO: Make faster when https://github.com/automl/Auto-PyTorch/pull/223 is incorporated
+@pytest.mark.parametrize("fit_dictionary_tabular", ['classification_categorical_only'], indirect=True)
+def test_do_traditional_pipeline(fit_dictionary_tabular):
+    backend = fit_dictionary_tabular['backend']
+    estimator = TabularClassificationTask(
+        backend=backend,
+        resampling_strategy=HoldoutValTypes.holdout_validation,
+        ensemble_size=0,
+    )
+
+    # Setup pre-requisites normally set by search()
+    estimator._create_dask_client()
+    estimator._metric = accuracy
+    estimator._logger = estimator._get_logger('test')
+    estimator._memory_limit = 5000
+    estimator._time_for_task = 60
+    estimator._disable_file_output = []
+    estimator._all_supported_metrics = False
+
+    estimator._do_traditional_prediction(time_left=60, func_eval_time_limit_secs=30)
+
+    # The models should not be on the current directory
+    assert not os.path.exists(os.path.join(os.getcwd(), '.autoPyTorch'))
+
+    # Then we should have fitted 5 classifiers
+    # Maybe some of them fail (unlikely, but we do not control external API)
+    # but we want to make this test robust
+    at_least_one_model_checked = False
+    for i in range(2, 7):
+        pred_path = os.path.join(
+            backend.temporary_directory, '.autoPyTorch', 'runs', f"1_{i}_50.0",
+            f"predictions_ensemble_1_{i}_50.0.npy"
+        )
+        assert os.path.exists(pred_path)
+
+        model_path = os.path.join(backend.temporary_directory,
+                                  '.autoPyTorch',
+                                  'runs', f"1_{i}_50.0",
+                                  f"1.{i}.50.0.model")
+
+        # Make sure the dummy model complies with scikit learn
+        # get/set params
+        assert os.path.exists(model_path)
+        with open(model_path, 'rb') as model_handler:
+            model = pickle.load(model_handler)
+        clone(model)
+        assert model.config == list(_classifiers.keys())[i - 2]
+        at_least_one_model_checked = True
+    if not at_least_one_model_checked:
+        pytest.fail("Not even one single traditional pipeline was fitted")
+
+    estimator._close_dask_client()
+    estimator._clean_logger()
+
+    del estimator
+
+
+@pytest.mark.parametrize("api_type", [TabularClassificationTask, TabularRegressionTask])
+def test_unsupported_msg(api_type):
+    api = api_type()
+    with pytest.raises(ValueError, match=r".*Dataset is incompatible for the given task.*"):
+        api._get_required_dataset_properties('dummy')
+    with pytest.raises(ValueError, match=r".*is only supported after calling search. Kindly .*"):
+        api.predict(np.ones((10, 10)))
+
+
+@pytest.mark.parametrize("fit_dictionary_tabular", ['classification_categorical_only'], indirect=True)
+@pytest.mark.parametrize("api_type", [TabularClassificationTask, TabularRegressionTask])
+def test_build_pipeline(api_type, fit_dictionary_tabular):
+    api = api_type()
+    pipeline = api.build_pipeline(fit_dictionary_tabular['dataset_properties'])
+    assert isinstance(pipeline, BaseEstimator)
+    assert len(pipeline.steps) > 0

test/test_api/test_base_api.py

Lines changed: 70 additions & 0 deletions
@@ -0,0 +1,70 @@
+import os
+import pathlib
+import pickle
+import sys
+import unittest
+from test.test_api.utils import dummy_do_dummy_prediction, dummy_eval_function
+
+import numpy as np
+
+import pandas as pd
+
+import pytest
+
+
+import sklearn
+import sklearn.datasets
+from sklearn.base import clone
+from sklearn.ensemble import VotingClassifier, VotingRegressor
+
+from smac.runhistory.runhistory import RunHistory
+
+from autoPyTorch.api.tabular_classification import TabularClassificationTask
+from autoPyTorch.api.tabular_regression import TabularRegressionTask
+from autoPyTorch.datasets.resampling_strategy import (
+    CrossValTypes,
+    HoldoutValTypes,
+)
+from autoPyTorch.optimizer.smbo import AutoMLSMBO
+from autoPyTorch.pipeline.components.training.metrics.metrics import accuracy
+
+from autoPyTorch.api.base_task import BaseTask
+
+
+# ====
+# Test
+# ====
+@pytest.mark.parametrize("fit_dictionary_tabular", ['classification_categorical_only'], indirect=True)
+def test_nonsupported_arguments(fit_dictionary_tabular):
+    with pytest.raises(ValueError, match=r".*Expected search space updates to be of instance.*"):
+        api = BaseTask(search_space_updates='None')
+
+    api = BaseTask()
+    with pytest.raises(ValueError, match=r".*Invalid configuration arguments given.*"):
+        api.set_pipeline_config(unsupported=True)
+    with pytest.raises(ValueError, match=r".*No search space initialised and no dataset.*"):
+        api.get_search_space()
+    api.resampling_strategy = None
+    with pytest.raises(ValueError, match=r".*Resampling strategy is needed to determine.*"):
+        api._load_models()
+    api.resampling_strategy = unittest.mock.MagicMock()
+    with pytest.raises(ValueError, match=r".*Providing a metric to AutoPytorch is required.*"):
+        api._load_models()
+    api.ensemble_ = unittest.mock.MagicMock()
+    with pytest.raises(ValueError, match=r".*No metric found. Either fit/search has not been.*"):
+        api.score(np.ones(10), np.ones(10))
+    api._metric = unittest.mock.MagicMock()
+    with pytest.raises(ValueError, match=r".*No valid model found in run history.*"):
+        api._load_models()
+    dataset = fit_dictionary_tabular['backend'].load_datamanager()
+    with pytest.raises(ValueError, match=r".*Incompatible dataset entered for current task.*"):
+        api._search('accuracy', dataset)
+
+    def returnfalse():
+        return False
+
+    api._load_models = returnfalse
+    with pytest.raises(ValueError, match=r".*No ensemble found. Either fit has not yet.*"):
+        api.predict(np.ones((10, 10)))
+    with pytest.raises(ValueError, match=r".*No ensemble found. Either fit has not yet.*"):
+        api.predict(np.ones((10, 10)))
Lines changed: 42 additions & 0 deletions
@@ -0,0 +1,42 @@
+import numpy as np
+
+from autoPyTorch.datasets.resampling_strategy import CrossValFuncs, HoldOutFuncs
+
+
+def test_holdoutfuncs():
+    split = HoldOutFuncs()
+    X = np.arange(10)
+    y = np.ones(10)
+    # Create a minority class
+    y[:2] = 0
+    train, val = split.holdout_validation(0, 0.5, X, shuffle=False)
+    assert len(train) == len(val) == 5
+
+    # No shuffling
+    np.testing.assert_array_equal(X, np.arange(10))
+
+    # Make sure the stratified version splits the minority class
+    train, val = split.stratified_holdout_validation(0, 0.5, X, stratify=y)
+    assert 0 in y[val]
+    assert 0 in y[train]
+
+
+def test_crossvalfuncs():
+    split = CrossValFuncs()
+    X = np.arange(100)
+    y = np.ones(100)
+    # Create a minority class
+    y[:11] = 0
+    splits = split.shuffle_split_cross_validation(0, 10, X)
+    assert len(splits) == 10
+    assert all([len(s[1]) == 10 for s in splits])
+
+    # Make sure the stratified version splits the minority class
+    splits = split.stratified_shuffle_split_cross_validation(0, 10, X, stratify=y)
+    assert len(splits) == 10
+    assert all([0 in y[s[1]] for s in splits])
+
+    #
+    splits = split.stratified_k_fold_cross_validation(0, 10, X, stratify=y)
+    assert len(splits) == 10
+    assert all([0 in y[s[1]] for s in splits])

test/test_datasets/test_tabular_dataset.py

Lines changed: 8 additions & 0 deletions
@@ -1,5 +1,8 @@
+import numpy as np
+
 import pytest

+from autoPyTorch.datasets.tabular_dataset import TabularDataset
 from autoPyTorch.utils.pipeline import get_dataset_requirements


@@ -38,3 +41,8 @@ def test_get_dataset_properties(backend, fit_dictionary_tabular):
     assert datamanager.train_tensors[0].shape == fit_dictionary_tabular['X_train'].shape
     assert datamanager.train_tensors[1].shape == fit_dictionary_tabular['y_train'].shape
     assert datamanager.task_type == 'tabular_classification'
+
+
+def test_not_supported():
+    with pytest.raises(ValueError, match=r".*A feature validator is required to build.*"):
+        TabularDataset(np.ones(10), np.ones(10))
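
The new test pins down the error path; for contrast, a hedged sketch of the supported construction route the message implies, assuming the `TabularInputValidator` API from the autoPyTorch code base at the time of this commit (names and signature are assumptions, not shown in this diff):

import numpy as np

from autoPyTorch.data.tabular_validator import TabularInputValidator
from autoPyTorch.datasets.tabular_dataset import TabularDataset

X = np.random.rand(10, 4)
y = np.random.randint(0, 2, size=10)

# Fit a feature/target validator first, then hand it to the dataset --
# the missing piece that makes TabularDataset(np.ones(10), np.ones(10)) raise.
validator = TabularInputValidator(is_classification=True)
validator.fit(X_train=X, y_train=y)
dataset = TabularDataset(X=X, Y=y, validator=validator)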
