
Commit b176bda

[FIX] Tests after rebase of reg_cocktails (#359)
* update requirements
* update requirements
* resolve remaining conflicts and fix flake and mypy
* Fix remaining tests and examples
* fix failing checks
* fix flake
1 parent 98c93c4 commit b176bda

38 files changed (+294 / -1055 lines changed)

autoPyTorch/api/base_task.py

Lines changed: 39 additions & 36 deletions
@@ -902,18 +902,15 @@ def run_traditional_ml(
                 learning algorithm runs over the time limit.
         """
         assert self._logger is not None  # for mypy compliancy
-        if STRING_TO_TASK_TYPES[self.task_type] in REGRESSION_TASKS:
-            self._logger.warning("Traditional Pipeline is not enabled for regression. Skipping...")
-        else:
-            traditional_task_name = 'runTraditional'
-            self._stopwatch.start_task(traditional_task_name)
-            elapsed_time = self._stopwatch.wall_elapsed(current_task_name)
-            time_for_traditional = int(runtime_limit - elapsed_time)
-            self._do_traditional_prediction(
-                func_eval_time_limit_secs=func_eval_time_limit_secs,
-                time_left=time_for_traditional,
-            )
-            self._stopwatch.stop_task(traditional_task_name)
+        traditional_task_name = 'runTraditional'
+        self._stopwatch.start_task(traditional_task_name)
+        elapsed_time = self._stopwatch.wall_elapsed(current_task_name)
+        time_for_traditional = int(runtime_limit - elapsed_time)
+        self._do_traditional_prediction(
+            func_eval_time_limit_secs=func_eval_time_limit_secs,
+            time_left=time_for_traditional,
+        )
+        self._stopwatch.stop_task(traditional_task_name)

     def _search(
         self,
@@ -1283,22 +1280,7 @@ def _search(
         self._logger.info("Starting Shutdown")

         if proc_ensemble is not None:
-            self._results_manager.ensemble_performance_history = list(proc_ensemble.history)
-
-            if len(proc_ensemble.futures) > 0:
-                # Also add ensemble runs that did not finish within smac time
-                # and add them into the ensemble history
-                self._logger.info("Ensemble script still running, waiting for it to finish.")
-                result = proc_ensemble.futures.pop().result()
-                if result:
-                    ensemble_history, _, _, _ = result
-                    self._results_manager.ensemble_performance_history.extend(ensemble_history)
-                self._logger.info("Ensemble script finished, continue shutdown.")
-
-            # save the ensemble performance history file
-            if len(self.ensemble_performance_history) > 0:
-                pd.DataFrame(self.ensemble_performance_history).to_json(
-                    os.path.join(self._backend.internals_directory, 'ensemble_history.json'))
+            self._collect_results_ensemble(proc_ensemble)

         if load_models:
             self._logger.info("Loading models...")
@@ -1566,7 +1548,7 @@ def fit_pipeline(
                                                      exclude=self.exclude_components,
                                                      search_space_updates=self.search_space_updates)
         dataset_properties = dataset.get_dataset_properties(dataset_requirements)
-        self._backend.replace_datamanager(dataset)
+        self._backend.save_datamanager(dataset)

         if self._logger is None:
             self._logger = self._get_logger(dataset.dataset_name)
@@ -1757,7 +1739,7 @@ def fit_ensemble(
         ensemble_fit_task_name = 'EnsembleFit'
         self._stopwatch.start_task(ensemble_fit_task_name)
         if enable_traditional_pipeline:
-            if func_eval_time_limit_secs is None or func_eval_time_limit_secs > time_for_task:
+            if func_eval_time_limit_secs > time_for_task:
                 self._logger.warning(
                     'Time limit for a single run is higher than total time '
                     'limit. Capping the limit for a single run to the total '
@@ -1798,12 +1780,8 @@ def fit_ensemble(
         )

         manager.build_ensemble(self._dask_client)
-        future = manager.futures.pop()
-        result = future.result()
-        if result is None:
-            raise ValueError("Errors occurred while building the ensemble - please"
-                             " check the log file and command line output for error messages.")
-        self.ensemble_performance_history, _, _, _ = result
+        if manager is not None:
+            self._collect_results_ensemble(manager)

         if load_models:
             self._load_models()
@@ -1881,6 +1859,31 @@ def _init_ensemble_builder(

         return proc_ensemble

+    def _collect_results_ensemble(
+        self,
+        manager: EnsembleBuilderManager
+    ) -> None:
+
+        if self._logger is None:
+            raise ValueError("logger should be initialized to fit ensemble")
+
+        self._results_manager.ensemble_performance_history = list(manager.history)
+
+        if len(manager.futures) > 0:
+            # Also add ensemble runs that did not finish within smac time
+            # and add them into the ensemble history
+            self._logger.info("Ensemble script still running, waiting for it to finish.")
+            result = manager.futures.pop().result()
+            if result:
+                ensemble_history, _, _, _ = result
+                self._results_manager.ensemble_performance_history.extend(ensemble_history)
+            self._logger.info("Ensemble script finished, continue shutdown.")
+
+        # save the ensemble performance history file
+        if len(self.ensemble_performance_history) > 0:
+            pd.DataFrame(self.ensemble_performance_history).to_json(
+                os.path.join(self._backend.internals_directory, 'ensemble_history.json'))
+
     def predict(
         self,
         X_test: np.ndarray,
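
The refactor above replaces two near-identical blocks (the shutdown path in _search and the tail of fit_ensemble) with a single _collect_results_ensemble helper. The snippet below is a minimal, self-contained sketch of that pattern; DummyEnsembleManager and collect_results_ensemble are illustrative stand-ins, not autoPyTorch's EnsembleBuilderManager API.

from concurrent.futures import Future, ThreadPoolExecutor
from typing import Any, Dict, List


class DummyEnsembleManager:
    """Stand-in for an ensemble-builder manager: finished history plus pending futures."""

    def __init__(self) -> None:
        self.history: List[Dict[str, Any]] = []
        self.futures: List[Future] = []


def collect_results_ensemble(manager: DummyEnsembleManager) -> List[Dict[str, Any]]:
    # Start from the runs that already finished.
    performance_history = list(manager.history)
    if len(manager.futures) > 0:
        # Wait for a run still in flight and merge its history, mirroring the
        # "Ensemble script still running" branch in the diff above.
        result = manager.futures.pop().result()
        if result:
            ensemble_history, *_ = result
            performance_history.extend(ensemble_history)
    return performance_history


if __name__ == "__main__":
    manager = DummyEnsembleManager()
    manager.history.append({"ensemble_optimization_score": 0.90})
    with ThreadPoolExecutor() as pool:
        manager.futures.append(
            pool.submit(lambda: ([{"ensemble_optimization_score": 0.92}], None, None, None))
        )
    print(collect_results_ensemble(manager))

Keeping one helper means the history-merging and JSON-persistence behaviour cannot drift apart between the two call sites.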

autoPyTorch/api/tabular_classification.py

Lines changed: 6 additions & 4 deletions
@@ -17,6 +17,7 @@
 from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType
 from autoPyTorch.datasets.resampling_strategy import (
     HoldoutValTypes,
+    CrossValTypes,
     ResamplingStrategies,
 )
 from autoPyTorch.datasets.tabular_dataset import TabularDataset
@@ -423,6 +424,7 @@ def search(

         if self.dataset is None:
             raise ValueError("`dataset` in {} must be initialized, but got None".format(self.__class__.__name__))
+
         return self._search(
             dataset=self.dataset,
             optimize_metric=optimize_metric,
@@ -462,23 +464,23 @@ def predict(
             raise ValueError("predict() is only supported after calling search. Kindly call first "
                              "the estimator search() method.")

-        X_test = self.input_validator.feature_validator.transform(X_test)
+        X_test = self.InputValidator.feature_validator.transform(X_test)
         predicted_probabilities = super().predict(X_test, batch_size=batch_size,
                                                   n_jobs=n_jobs)

-        if self.input_validator.target_validator.is_single_column_target():
+        if self.InputValidator.target_validator.is_single_column_target():
             predicted_indexes = np.argmax(predicted_probabilities, axis=1)
         else:
             predicted_indexes = (predicted_probabilities > 0.5).astype(int)

         # Allow to predict in the original domain -- that is, the user is not interested
         # in our encoded values
-        return self.input_validator.target_validator.inverse_transform(predicted_indexes)
+        return self.InputValidator.target_validator.inverse_transform(predicted_indexes)

     def predict_proba(self,
                       X_test: Union[np.ndarray, pd.DataFrame, List],
                       batch_size: Optional[int] = None, n_jobs: int = 1) -> np.ndarray:
-        if self.input_validator is None or not self.input_validator._is_fitted:
+        if self.InputValidator is None or not self.InputValidator._is_fitted:
             raise ValueError("predict() is only supported after calling search. Kindly call first "
                              "the estimator search() method.")
         X_test = self.input_validator.feature_validator.transform(X_test)
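
For reference, the decoding step that predict() performs after the validator rename (argmax for a single-column target, a 0.5 threshold otherwise, then mapping back to the original labels) works roughly like the sketch below. The LabelEncoder is only a stand-in for self.InputValidator.target_validator.

import numpy as np
from sklearn.preprocessing import LabelEncoder

# Stand-in for the fitted target validator.
target_validator = LabelEncoder().fit(["cat", "dog", "fish"])

predicted_probabilities = np.array([[0.1, 0.7, 0.2],
                                    [0.8, 0.1, 0.1]])

single_column_target = True  # assumption: multiclass, not multilabel
if single_column_target:
    # One class per row: pick the most probable index.
    predicted_indexes = np.argmax(predicted_probabilities, axis=1)
else:
    # Multilabel: threshold each class independently.
    predicted_indexes = (predicted_probabilities > 0.5).astype(int)

# Hand predictions back in the user's original label domain.
print(target_validator.inverse_transform(predicted_indexes))  # ['dog' 'cat']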

autoPyTorch/api/tabular_regression.py

Lines changed: 5 additions & 3 deletions
@@ -17,6 +17,7 @@
 from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType
 from autoPyTorch.datasets.resampling_strategy import (
     HoldoutValTypes,
+    CrossValTypes,
     ResamplingStrategies,
 )
 from autoPyTorch.datasets.tabular_dataset import TabularDataset
@@ -424,6 +425,7 @@ def search(

         if self.dataset is None:
             raise ValueError("`dataset` in {} must be initialized, but got None".format(self.__class__.__name__))
+
         return self._search(
             dataset=self.dataset,
             optimize_metric=optimize_metric,
@@ -449,14 +451,14 @@ def predict(
             batch_size: Optional[int] = None,
             n_jobs: int = 1
     ) -> np.ndarray:
-        if self.input_validator is None or not self.input_validator._is_fitted:
+        if self.InputValidator is None or not self.InputValidator._is_fitted:
             raise ValueError("predict() is only supported after calling search. Kindly call first "
                              "the estimator search() method.")

-        X_test = self.input_validator.feature_validator.transform(X_test)
+        X_test = self.InputValidator.feature_validator.transform(X_test)
         predicted_values = super().predict(X_test, batch_size=batch_size,
                                            n_jobs=n_jobs)

         # Allow to predict in the original domain -- that is, the user is not interested
         # in our encoded values
-        return self.input_validator.target_validator.inverse_transform(predicted_values)
+        return self.InputValidator.target_validator.inverse_transform(predicted_values)

autoPyTorch/data/base_target_validator.py

Lines changed: 0 additions & 1 deletion
@@ -85,7 +85,6 @@ def fit(
                     np.shape(y_test)
                 ))
             if isinstance(y_train, pd.DataFrame):
-                y_train = cast(pd.DataFrame, y_train)
                 y_test = cast(pd.DataFrame, y_test)
                 if y_train.columns.tolist() != y_test.columns.tolist():
                     raise ValueError(

autoPyTorch/data/tabular_feature_validator.py

Lines changed: 2 additions & 26 deletions
@@ -277,29 +277,8 @@ def transform(
         if isinstance(X, np.ndarray):
             X = self.numpy_to_pandas(X)

-        if hasattr(X, "iloc") and not issparse(X):
-            X = cast(pd.DataFrame, X)
-            # If we had null columns in our fit call and we made them numeric, then:
-            # - If the columns are null even in transform, apply the same procedure.
-            # - Otherwise, substitute the values with np.NaN and then make the columns numeric.
-            # If the column is null here, but it was not in fit, it does not matter.
-            for column in self.null_columns:
-                # The column is not null, make it null since it was null in fit.
-                if not X[column].isna().all():
-                    X[column] = np.NaN
-                X[column] = pd.to_numeric(X[column])
-
-            # for the test set, if we have columns with only null values
-            # they will probably have a numeric type. If these columns were not
-            # with only null values in the train set, they should be converted
-            # to the type that they had during fitting.
-            for column in X.columns:
-                if X[column].isna().all():
-                    X[column] = X[column].astype(self.dtypes[list(X.columns).index(column)])
-
-            # Also remove the object dtype for new data
-            if not X.select_dtypes(include='object').empty:
-                X = self.infer_objects(X)
+        if hasattr(X, "iloc") and not scipy.sparse.issparse(X):
+            X = cast(Type[pd.DataFrame], X)

         # Check the data here so we catch problems on new test data
         self._check_data(X)
@@ -458,9 +437,6 @@ def _get_columns_info(
                 Type of each column numerical/categorical
         """

-        if len(self.transformed_columns) > 0 and self.feat_type is not None:
-            return self.transformed_columns, self.feat_type
-
         # Register if a column needs encoding
         numerical_columns = []
         categorical_columns = []
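
The slimmed-down guard in transform() now only has to distinguish a DataFrame-like object from a scipy sparse matrix before the pandas-specific handling. A tiny standalone illustration of that check (not the validator itself):

import numpy as np
import pandas as pd
import scipy.sparse


def is_dataframe_like(X) -> bool:
    # Same condition as in the diff: exposes .iloc and is not a sparse matrix.
    return hasattr(X, "iloc") and not scipy.sparse.issparse(X)


print(is_dataframe_like(pd.DataFrame({"a": [1, 2]})))         # True
print(is_dataframe_like(scipy.sparse.csr_matrix(np.eye(2))))  # False
print(is_dataframe_like(np.eye(2)))                           # False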

autoPyTorch/data/tabular_target_validator.py

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-from typing import List, Optional, Union, cast
+from typing import List, Optional, cast

 import numpy as np

autoPyTorch/evaluation/fit_evaluator.py

Lines changed: 1 addition & 1 deletion
@@ -10,13 +10,13 @@

 from smac.tae import StatusType

+from autoPyTorch.automl_common.common.utils.backend import Backend
 from autoPyTorch.datasets.resampling_strategy import NoResamplingStrategyTypes
 from autoPyTorch.evaluation.abstract_evaluator import (
     AbstractEvaluator,
     fit_and_suppress_warnings
 )
 from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric
-from autoPyTorch.utils.backend import Backend
 from autoPyTorch.utils.common import subsampler
 from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates

autoPyTorch/optimizer/smbo.py

Lines changed: 5 additions & 1 deletion
@@ -104,7 +104,7 @@ def __init__(self,
                 resampling_strategy_args: Optional[Dict[str, Any]] = None,
                 include: Optional[Dict[str, Any]] = None,
                 exclude: Optional[Dict[str, Any]] = None,
-                disable_file_output: List = [],
+                disable_file_output: Union[bool, List[str]] = False,
                 smac_scenario_args: Optional[Dict[str, Any]] = None,
                 get_smac_object_callback: Optional[Callable] = None,
                 all_supported_metrics: bool = True,
@@ -250,6 +250,10 @@ def __init__(self,
         self.initial_configurations = initial_configurations \
             if len(initial_configurations) > 0 else None

+        if len(self.initial_configurations) == 0:
+            self.logger.warning("None of the portfolio configurations are compatible"
+                                " with the current search space. Skipping initial configuration...")
+
     def run_smbo(self, func: Optional[Callable] = None
                  ) -> Tuple[RunHistory, List[TrajEntry], str]: