Commit 907b537

[FIX] Tests after rebase of reg_cocktails (#359)

* update requirements
* update requirements
* resolve remaining conflicts and fix flake and mypy
* Fix remaining tests and examples
* fix failing checks
* fix flake

1 parent c310ef6 commit 907b537

38 files changed: +308 -1052 lines changed

autoPyTorch/api/base_task.py

Lines changed: 39 additions & 36 deletions

```diff
@@ -900,18 +900,15 @@ def run_traditional_ml(
                 learning algorithm runs over the time limit.
         """
         assert self._logger is not None  # for mypy compliancy
-        if STRING_TO_TASK_TYPES[self.task_type] in REGRESSION_TASKS:
-            self._logger.warning("Traditional Pipeline is not enabled for regression. Skipping...")
-        else:
-            traditional_task_name = 'runTraditional'
-            self._stopwatch.start_task(traditional_task_name)
-            elapsed_time = self._stopwatch.wall_elapsed(current_task_name)
-            time_for_traditional = int(runtime_limit - elapsed_time)
-            self._do_traditional_prediction(
-                func_eval_time_limit_secs=func_eval_time_limit_secs,
-                time_left=time_for_traditional,
-            )
-            self._stopwatch.stop_task(traditional_task_name)
+        traditional_task_name = 'runTraditional'
+        self._stopwatch.start_task(traditional_task_name)
+        elapsed_time = self._stopwatch.wall_elapsed(current_task_name)
+        time_for_traditional = int(runtime_limit - elapsed_time)
+        self._do_traditional_prediction(
+            func_eval_time_limit_secs=func_eval_time_limit_secs,
+            time_left=time_for_traditional,
+        )
+        self._stopwatch.stop_task(traditional_task_name)

     def _search(
         self,
@@ -1281,22 +1278,7 @@ def _search(
         self._logger.info("Starting Shutdown")

         if proc_ensemble is not None:
-            self._results_manager.ensemble_performance_history = list(proc_ensemble.history)
-
-            if len(proc_ensemble.futures) > 0:
-                # Also add ensemble runs that did not finish within smac time
-                # and add them into the ensemble history
-                self._logger.info("Ensemble script still running, waiting for it to finish.")
-                result = proc_ensemble.futures.pop().result()
-                if result:
-                    ensemble_history, _, _, _ = result
-                    self._results_manager.ensemble_performance_history.extend(ensemble_history)
-                self._logger.info("Ensemble script finished, continue shutdown.")
-
-            # save the ensemble performance history file
-            if len(self.ensemble_performance_history) > 0:
-                pd.DataFrame(self.ensemble_performance_history).to_json(
-                    os.path.join(self._backend.internals_directory, 'ensemble_history.json'))
+            self._collect_results_ensemble(proc_ensemble)

         if load_models:
             self._logger.info("Loading models...")
@@ -1564,7 +1546,7 @@ def fit_pipeline(
             exclude=self.exclude_components,
             search_space_updates=self.search_space_updates)
         dataset_properties = dataset.get_dataset_properties(dataset_requirements)
-        self._backend.replace_datamanager(dataset)
+        self._backend.save_datamanager(dataset)

         if self._logger is None:
             self._logger = self._get_logger(dataset.dataset_name)
@@ -1754,7 +1736,7 @@ def fit_ensemble(
         ensemble_fit_task_name = 'EnsembleFit'
         self._stopwatch.start_task(ensemble_fit_task_name)
         if enable_traditional_pipeline:
-            if func_eval_time_limit_secs is None or func_eval_time_limit_secs > time_for_task:
+            if func_eval_time_limit_secs > time_for_task:
                 self._logger.warning(
                     'Time limit for a single run is higher than total time '
                     'limit. Capping the limit for a single run to the total '
@@ -1795,12 +1777,8 @@ def fit_ensemble(
         )

         manager.build_ensemble(self._dask_client)
-        future = manager.futures.pop()
-        result = future.result()
-        if result is None:
-            raise ValueError("Errors occurred while building the ensemble - please"
-                             " check the log file and command line output for error messages.")
-        self.ensemble_performance_history, _, _, _ = result
+        if manager is not None:
+            self._collect_results_ensemble(manager)

         if load_models:
             self._load_models()
@@ -1878,6 +1856,31 @@ def _init_ensemble_builder(

         return proc_ensemble

+    def _collect_results_ensemble(
+        self,
+        manager: EnsembleBuilderManager
+    ) -> None:
+
+        if self._logger is None:
+            raise ValueError("logger should be initialized to fit ensemble")
+
+        self._results_manager.ensemble_performance_history = list(manager.history)
+
+        if len(manager.futures) > 0:
+            # Also add ensemble runs that did not finish within smac time
+            # and add them into the ensemble history
+            self._logger.info("Ensemble script still running, waiting for it to finish.")
+            result = manager.futures.pop().result()
+            if result:
+                ensemble_history, _, _, _ = result
+                self._results_manager.ensemble_performance_history.extend(ensemble_history)
+            self._logger.info("Ensemble script finished, continue shutdown.")
+
+        # save the ensemble performance history file
+        if len(self.ensemble_performance_history) > 0:
+            pd.DataFrame(self.ensemble_performance_history).to_json(
+                os.path.join(self._backend.internals_directory, 'ensemble_history.json'))
+
     def predict(
         self,
         X_test: np.ndarray,
```
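
Reviewer note on base_task.py: the ensemble bookkeeping that was previously duplicated between the `_search` shutdown path and `fit_ensemble` is now routed through the new `_collect_results_ensemble` helper, which copies the manager's finished history, blocks on any still-running build, and writes the merged history to `ensemble_history.json`. The snippet below is a minimal, self-contained sketch of that collect-and-wait pattern only; the toy `Manager` class merely stands in for `EnsembleBuilderManager`, and none of these names are the autoPyTorch API.

```python
from concurrent.futures import Future, ThreadPoolExecutor
from dataclasses import dataclass, field
from typing import Any, Dict, List


@dataclass
class Manager:
    """Toy stand-in for EnsembleBuilderManager: finished results plus pending futures."""
    history: List[Dict[str, Any]] = field(default_factory=list)
    futures: List[Future] = field(default_factory=list)


def collect_results(manager: Manager) -> List[Dict[str, Any]]:
    """Copy the finished history, then wait for a build that is still running."""
    performance_history = list(manager.history)
    if len(manager.futures) > 0:
        result = manager.futures.pop().result()  # block until the outstanding build finishes
        if result:
            performance_history.extend(result)
    return performance_history


if __name__ == "__main__":
    with ThreadPoolExecutor(max_workers=1) as executor:
        manager = Manager(history=[{"ensemble_accuracy": 0.91}])
        manager.futures.append(executor.submit(lambda: [{"ensemble_accuracy": 0.93}]))
        print(collect_results(manager))  # both the finished and the late-arriving entry
```

Both call sites in the diff simply delegate to the helper (`self._collect_results_ensemble(proc_ensemble)` during shutdown, `self._collect_results_ensemble(manager)` in `fit_ensemble`), so the waiting and persistence logic now lives in one place.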

autoPyTorch/api/tabular_classification.py

Lines changed: 18 additions & 13 deletions

```diff
@@ -14,6 +14,7 @@
 from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType
 from autoPyTorch.datasets.resampling_strategy import (
     HoldoutValTypes,
+    CrossValTypes,
     ResamplingStrategies,
 )
 from autoPyTorch.datasets.tabular_dataset import TabularDataset
@@ -88,17 +89,9 @@ def __init__(
         output_directory: Optional[str] = None,
         delete_tmp_folder_after_terminate: bool = True,
         delete_output_folder_after_terminate: bool = True,
-<<<<<<< HEAD
         include_components: Optional[Dict[str, Any]] = None,
         exclude_components: Optional[Dict[str, Any]] = None,
         resampling_strategy: ResamplingStrategies = HoldoutValTypes.holdout_validation,
-=======
-        include_components: Optional[Dict] = None,
-        exclude_components: Optional[Dict] = None,
-        resampling_strategy: Union[CrossValTypes,
-                                   HoldoutValTypes,
-                                   NoResamplingStrategyTypes] = HoldoutValTypes.holdout_validation,
->>>>>>> Create fit evaluator, no resampling strategy and fix bug for test statistics
         resampling_strategy_args: Optional[Dict[str, Any]] = None,
         backend: Optional[Backend] = None,
         search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None
@@ -381,6 +374,18 @@ def search(
             self

         """
+        if dataset_name is None:
+            dataset_name = str(uuid.uuid1(clock_seq=os.getpid()))
+
+        # we have to create a logger for at this point for the validator
+        self._logger = self._get_logger(dataset_name)
+
+        # Create a validator object to make sure that the data provided by
+        # the user matches the autopytorch requirements
+        self.InputValidator = TabularInputValidator(
+            is_classification=True,
+            logger_port=self._logger_port,
+        )

         self.dataset, self.InputValidator = self._get_dataset_input_validator(
             X_train=X_train,
@@ -399,9 +404,9 @@ def search(
                 '(CrossValTypes, HoldoutValTypes), but got {}'.format(self.resampling_strategy)
             )

-
         if self.dataset is None:
             raise ValueError("`dataset` in {} must be initialized, but got None".format(self.__class__.__name__))
+
         return self._search(
             dataset=self.dataset,
             optimize_metric=optimize_metric,
@@ -441,23 +446,23 @@ def predict(
             raise ValueError("predict() is only supported after calling search. Kindly call first "
                              "the estimator search() method.")

-        X_test = self.input_validator.feature_validator.transform(X_test)
+        X_test = self.InputValidator.feature_validator.transform(X_test)
         predicted_probabilities = super().predict(X_test, batch_size=batch_size,
                                                   n_jobs=n_jobs)

-        if self.input_validator.target_validator.is_single_column_target():
+        if self.InputValidator.target_validator.is_single_column_target():
             predicted_indexes = np.argmax(predicted_probabilities, axis=1)
         else:
             predicted_indexes = (predicted_probabilities > 0.5).astype(int)

         # Allow to predict in the original domain -- that is, the user is not interested
         # in our encoded values
-        return self.input_validator.target_validator.inverse_transform(predicted_indexes)
+        return self.InputValidator.target_validator.inverse_transform(predicted_indexes)

     def predict_proba(self,
                       X_test: Union[np.ndarray, pd.DataFrame, List],
                       batch_size: Optional[int] = None, n_jobs: int = 1) -> np.ndarray:
-        if self.input_validator is None or not self.input_validator._is_fitted:
+        if self.InputValidator is None or not self.InputValidator._is_fitted:
             raise ValueError("predict() is only supported after calling search. Kindly call first "
                              "the estimator search() method.")
         X_test = self.InputValidator.feature_validator.transform(X_test)
```
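
Reviewer note on tabular_classification.py: besides removing the leftover merge-conflict markers in `__init__`, `search()` now creates the logger and the `TabularInputValidator` up front, and `predict`/`predict_proba` consistently use the capitalised `self.InputValidator` attribute rather than the old `self.input_validator`. A hedged sketch of the resulting user-facing flow follows; the argument names reflect the autoPyTorch API around this revision and may differ in other versions, and the synthetic data and tiny time limits are purely illustrative.

```python
import numpy as np

from autoPyTorch.api.tabular_classification import TabularClassificationTask

# Tiny synthetic problem, only to exercise the search -> predict round trip.
X = np.random.rand(100, 4)
y = np.random.randint(0, 2, size=100)

estimator = TabularClassificationTask()
estimator.search(
    X_train=X,
    y_train=y,
    optimize_metric='accuracy',
    total_walltime_limit=60,
    func_eval_time_limit_secs=10,
)

# Both calls are routed through estimator.InputValidator internally, so any
# external code that reached into estimator.input_validator needs updating.
predictions = estimator.predict(X)
probabilities = estimator.predict_proba(X)
```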

autoPyTorch/api/tabular_regression.py

Lines changed: 5 additions & 12 deletions

```diff
@@ -14,6 +14,7 @@
 from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType
 from autoPyTorch.datasets.resampling_strategy import (
     HoldoutValTypes,
+    CrossValTypes,
     ResamplingStrategies,
 )
 from autoPyTorch.datasets.tabular_dataset import TabularDataset
@@ -88,17 +89,9 @@ def __init__(
         output_directory: Optional[str] = None,
         delete_tmp_folder_after_terminate: bool = True,
         delete_output_folder_after_terminate: bool = True,
-<<<<<<< HEAD
         include_components: Optional[Dict[str, Any]] = None,
         exclude_components: Optional[Dict[str, Any]] = None,
         resampling_strategy: ResamplingStrategies = HoldoutValTypes.holdout_validation,
-=======
-        include_components: Optional[Dict] = None,
-        exclude_components: Optional[Dict] = None,
-        resampling_strategy:Union[CrossValTypes,
-                                  HoldoutValTypes,
-                                  NoResamplingStrategyTypes] = HoldoutValTypes.holdout_validation,
->>>>>>> Create fit evaluator, no resampling strategy and fix bug for test statistics
         resampling_strategy_args: Optional[Dict[str, Any]] = None,
         backend: Optional[Backend] = None,
         search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None
@@ -398,9 +391,9 @@ def search(
                 '(CrossValTypes, HoldoutValTypes), but got {}'.format(self.resampling_strategy)
             )

-
         if self.dataset is None:
             raise ValueError("`dataset` in {} must be initialized, but got None".format(self.__class__.__name__))
+
         return self._search(
             dataset=self.dataset,
             optimize_metric=optimize_metric,
@@ -426,14 +419,14 @@ def predict(
         batch_size: Optional[int] = None,
         n_jobs: int = 1
     ) -> np.ndarray:
-        if self.input_validator is None or not self.input_validator._is_fitted:
+        if self.InputValidator is None or not self.InputValidator._is_fitted:
             raise ValueError("predict() is only supported after calling search. Kindly call first "
                              "the estimator search() method.")

-        X_test = self.input_validator.feature_validator.transform(X_test)
+        X_test = self.InputValidator.feature_validator.transform(X_test)
         predicted_values = super().predict(X_test, batch_size=batch_size,
                                            n_jobs=n_jobs)

         # Allow to predict in the original domain -- that is, the user is not interested
         # in our encoded values
-        return self.input_validator.target_validator.inverse_transform(predicted_values)
+        return self.InputValidator.target_validator.inverse_transform(predicted_values)
```
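
Reviewer note on tabular_regression.py: the same cleanup as on the classification side (conflict markers dropped, `InputValidator` rename in `predict`). A matching hedged sketch, again with illustrative data, metric, and time limits:

```python
import numpy as np

from autoPyTorch.api.tabular_regression import TabularRegressionTask

X = np.random.rand(100, 4)
y = np.random.rand(100)

estimator = TabularRegressionTask()
estimator.search(
    X_train=X,
    y_train=y,
    optimize_metric='r2',
    total_walltime_limit=60,
    func_eval_time_limit_secs=10,
)

# predict() now transforms X through estimator.InputValidator before
# inverse-transforming the predictions back to the original target domain.
predicted = estimator.predict(X)
```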

autoPyTorch/data/base_target_validator.py

Lines changed: 0 additions & 1 deletion

```diff
@@ -98,7 +98,6 @@ def fit(
                     np.shape(y_test)
                 ))
             if isinstance(y_train, pd.DataFrame):
-                y_train = cast(pd.DataFrame, y_train)
                 y_test = cast(pd.DataFrame, y_test)
                 if y_train.columns.tolist() != y_test.columns.tolist():
                     raise ValueError(
```

autoPyTorch/data/tabular_feature_validator.py

Lines changed: 2 additions & 5 deletions

```diff
@@ -1,5 +1,5 @@
 import functools
-from typing import Dict, List, Optional, Tuple, Union, cast
+from typing import Dict, List, Optional, Tuple, Type, Union, cast

 import numpy as np

@@ -263,7 +263,7 @@ def transform(
             X = self.numpy_to_pandas(X)

         if hasattr(X, "iloc") and not scipy.sparse.issparse(X):
-            X = cast(pd.DataFrame, X)
+            X = cast(Type[pd.DataFrame], X)

         # Check the data here so we catch problems on new test data
         self._check_data(X)
@@ -391,9 +391,6 @@ def _get_columns_info(
                 Type of each column numerical/categorical
         """

-        if len(self.transformed_columns) > 0 and self.feat_type is not None:
-            return self.transformed_columns, self.feat_type
-
         # Register if a column needs encoding
         numerical_columns = []
         categorical_columns = []
```

autoPyTorch/data/tabular_target_validator.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -1,4 +1,4 @@
-from typing import List, Optional, Union, cast
+from typing import List, Optional, cast

 import numpy as np

```

autoPyTorch/evaluation/fit_evaluator.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -10,13 +10,13 @@

 from smac.tae import StatusType

+from autoPyTorch.automl_common.common.utils.backend import Backend
 from autoPyTorch.datasets.resampling_strategy import NoResamplingStrategyTypes
 from autoPyTorch.evaluation.abstract_evaluator import (
     AbstractEvaluator,
     fit_and_suppress_warnings
 )
 from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric
-from autoPyTorch.utils.backend import Backend
 from autoPyTorch.utils.common import subsampler
 from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates

```

autoPyTorch/optimizer/smbo.py

Lines changed: 5 additions & 1 deletion

```diff
@@ -105,7 +105,7 @@ def __init__(self,
                  resampling_strategy_args: Optional[Dict[str, Any]] = None,
                  include: Optional[Dict[str, Any]] = None,
                  exclude: Optional[Dict[str, Any]] = None,
-                 disable_file_output: List = [],
+                 disable_file_output: Union[bool, List[str]] = False,
                  smac_scenario_args: Optional[Dict[str, Any]] = None,
                  get_smac_object_callback: Optional[Callable] = None,
                  all_supported_metrics: bool = True,
@@ -248,6 +248,10 @@ def __init__(self,
         if portfolio_selection is not None:
             self.initial_configurations = read_return_initial_configurations(config_space=config_space,
                                                                               portfolio_selection=portfolio_selection)
+            if len(self.initial_configurations) == 0:
+                self.initial_configurations = None
+                self.logger.warning("None of the portfolio configurations are compatible"
+                                    " with the current search space. Skipping initial configuration...")

     def reset_data_manager(self) -> None:
         if self.datamanager is not None:
```
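
Reviewer note on smbo.py: `disable_file_output` gets a stricter `Union[bool, List[str]]` type, and the portfolio warm-start path now guards against an empty selection: if none of the portfolio configurations fit the current search space, `initial_configurations` is reset to `None` and a warning is logged, so SMAC falls back to its own initial design instead of receiving an empty list. The sketch below reproduces only that guard in isolation; `select_initial_configurations` and the dict-based configurations are illustrative stand-ins, not the autoPyTorch implementation.

```python
import logging
from typing import Dict, List, Optional

logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger("portfolio")


def select_initial_configurations(
    portfolio: List[Dict[str, float]],
    searchable_hyperparameters: List[str],
) -> Optional[List[Dict[str, float]]]:
    """Keep only portfolio entries whose hyperparameters all exist in the search space."""
    compatible = [cfg for cfg in portfolio
                  if set(cfg) <= set(searchable_hyperparameters)]
    if len(compatible) == 0:
        logger.warning("None of the portfolio configurations are compatible"
                       " with the current search space. Skipping initial configuration...")
        return None  # SMAC then starts from its default/random initial design
    return compatible


if __name__ == "__main__":
    portfolio = [{"lr": 0.01, "momentum": 0.9}, {"lr": 0.1, "weight_decay": 1e-4}]
    print(select_initial_configurations(portfolio, ["lr", "momentum"]))
    # -> [{'lr': 0.01, 'momentum': 0.9}]
```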
