Skip to content

Commit 0c8d2ff

Browse files
ArlindKadra and ravinkohli
authored and committed
Cocktail hotfixes (#245)
* Fixes for the development branch and regularization cocktails * Update implementation * Fix unit tests temporarily * Implementation update and bug fixes * Removing unecessary code * Addressing Ravin's comments [refactor] Address Shuhei's comments [refactor] Address Shuhei's comments [refactor] Address Shuhei's comments [refactor] Address Shuhei's comments
1 parent c4b7729 commit 0c8d2ff

File tree

11 files changed

+377
-46
lines changed

11 files changed

+377
-46
lines changed

autoPyTorch/api/base_task.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -459,7 +459,7 @@ def set_pipeline_config(self, **pipeline_config_kwargs: Any) -> None:
459459
None
460460
"""
461461
unknown_keys = []
462-
for option, value in pipeline_config_kwargs.items():
462+
for option in pipeline_config_kwargs.keys():
463463
if option in self.pipeline_options.keys():
464464
pass
465465
else:

autoPyTorch/api/tabular_classification.py

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -92,17 +92,9 @@ def __init__(
9292
output_directory: Optional[str] = None,
9393
delete_tmp_folder_after_terminate: bool = True,
9494
delete_output_folder_after_terminate: bool = True,
95-
<<<<<<< HEAD
9695
include_components: Optional[Dict[str, Any]] = None,
9796
exclude_components: Optional[Dict[str, Any]] = None,
9897
resampling_strategy: ResamplingStrategies = HoldoutValTypes.holdout_validation,
99-
=======
100-
include_components: Optional[Dict] = None,
101-
exclude_components: Optional[Dict] = None,
102-
resampling_strategy: Union[CrossValTypes,
103-
HoldoutValTypes,
104-
NoResamplingStrategyTypes] = HoldoutValTypes.holdout_validation,
105-
>>>>>>> Create fit evaluator, no resampling strategy and fix bug for test statistics
10698
resampling_strategy_args: Optional[Dict[str, Any]] = None,
10799
backend: Optional[Backend] = None,
108100
search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None

autoPyTorch/api/tabular_regression.py

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -93,17 +93,9 @@ def __init__(
9393
output_directory: Optional[str] = None,
9494
delete_tmp_folder_after_terminate: bool = True,
9595
delete_output_folder_after_terminate: bool = True,
96-
<<<<<<< HEAD
9796
include_components: Optional[Dict[str, Any]] = None,
9897
exclude_components: Optional[Dict[str, Any]] = None,
9998
resampling_strategy: ResamplingStrategies = HoldoutValTypes.holdout_validation,
100-
=======
101-
include_components: Optional[Dict] = None,
102-
exclude_components: Optional[Dict] = None,
103-
resampling_strategy:Union[CrossValTypes,
104-
HoldoutValTypes,
105-
NoResamplingStrategyTypes] = HoldoutValTypes.holdout_validation,
106-
>>>>>>> Create fit evaluator, no resampling strategy and fix bug for test statistics
10799
resampling_strategy_args: Optional[Dict[str, Any]] = None,
108100
backend: Optional[Backend] = None,
109101
search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None

autoPyTorch/data/tabular_feature_validator.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -468,7 +468,7 @@ def _get_columns_to_encode(
468468
feat_types = []
469469

470470
# Make sure each column is a valid type
471-
for i, column in enumerate(X.columns):
471+
for column in X.columns:
472472
if X[column].dtype.name in ['category', 'bool']:
473473

474474
transformed_columns.append(column)
@@ -592,7 +592,7 @@ def infer_objects(self, X: pd.DataFrame) -> pd.DataFrame:
592592
X[key] = X[key].astype(dtype.name)
593593
except Exception as e:
594594
# Try inference if possible
595-
self.logger.warning(f"Tried to cast column {key} to {dtype} caused {e}")
595+
self.logger.warning(f'Casting the column {key} to {dtype} caused the exception {e}')
596596
pass
597597
else:
598598
X = X.infer_objects()
Lines changed: 319 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,319 @@
1+
import time
2+
from multiprocessing.queues import Queue
3+
from typing import Any, Dict, List, Optional, Tuple, Union
4+
5+
from ConfigSpace.configuration_space import Configuration
6+
7+
import numpy as np
8+
9+
from sklearn.base import BaseEstimator
10+
11+
from smac.tae import StatusType
12+
13+
from autoPyTorch.datasets.resampling_strategy import NoResamplingStrategyTypes
14+
from autoPyTorch.evaluation.abstract_evaluator import (
15+
AbstractEvaluator,
16+
fit_and_suppress_warnings
17+
)
18+
from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric
19+
from autoPyTorch.utils.backend import Backend
20+
from autoPyTorch.utils.common import subsampler
21+
from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates
22+
23+
24+
class FitEvaluator(AbstractEvaluator):
    """Evaluator that fits a pipeline on the whole training set.

    Unlike the holdout/cross-validation evaluators, this evaluator requires
    the datamanager's resampling strategy to be ``NoResamplingStrategyTypes``:
    the single split carries no test indices and the pipeline is fitted on all
    of the training data.

    Args:
        backend: Backend used to load the datamanager and persist run artefacts.
        queue: Multiprocessing queue on which the final result dict is put.
        metric: Optimization metric used to compute the reported loss.
        budget: Budget allocated to this run (epochs/time, per ``budget_type``).
        budget_type: Interpretation of ``budget`` (e.g. 'epochs'); may be None.
        pipeline_config: Extra options forwarded to the pipeline.
        configuration: Hyperparameter configuration to evaluate.
        seed: Random seed for the run.
        output_y_hat_optimization: Whether to store optimization predictions
            (unused here — there is no optimization split without resampling).
        num_run: Identifier of this run, used in the on-disk model directory.
        include / exclude: Component (de)selection forwarded to the pipeline.
        disable_file_output: If truthy, nothing is written to disk.
        init_params: Initial parameters forwarded to the pipeline.
        logger_port: Port of the logging server, if any.
        keep_models: Whether the fitted pipeline should be kept on the evaluator.
        all_supported_metrics: Whether to compute all supported metrics.
        search_space_updates: Hyperparameter search-space updates.

    Raises:
        ValueError: If the datamanager's resampling strategy is not a
            ``NoResamplingStrategyTypes``.
    """

    def __init__(self, backend: Backend, queue: Queue,
                 metric: autoPyTorchMetric,
                 budget: float,
                 budget_type: Optional[str] = None,
                 pipeline_config: Optional[Dict[str, Any]] = None,
                 configuration: Optional[Configuration] = None,
                 seed: int = 1,
                 output_y_hat_optimization: bool = False,
                 num_run: Optional[int] = None,
                 include: Optional[Dict[str, Any]] = None,
                 exclude: Optional[Dict[str, Any]] = None,
                 disable_file_output: Union[bool, List] = False,
                 init_params: Optional[Dict[str, Any]] = None,
                 logger_port: Optional[int] = None,
                 keep_models: Optional[bool] = None,
                 all_supported_metrics: bool = True,
                 search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None) -> None:
        super().__init__(
            backend=backend,
            queue=queue,
            configuration=configuration,
            metric=metric,
            seed=seed,
            output_y_hat_optimization=output_y_hat_optimization,
            num_run=num_run,
            include=include,
            exclude=exclude,
            disable_file_output=disable_file_output,
            init_params=init_params,
            budget=budget,
            budget_type=budget_type,
            logger_port=logger_port,
            all_supported_metrics=all_supported_metrics,
            pipeline_config=pipeline_config,
            search_space_updates=search_space_updates
        )
        if not isinstance(self.datamanager.resampling_strategy, NoResamplingStrategyTypes):
            raise ValueError(
                "FitEvaluator needs to be fitted on the whole dataset and resampling_strategy "
                "must be `NoResamplingStrategyTypes`, but got {}".format(
                    self.datamanager.resampling_strategy
                ))

        self.splits = self.datamanager.splits
        self.Y_target: Optional[np.ndarray] = None
        # np.nan marks targets that have not been filled in yet.
        # (np.NaN alias removed in NumPy 2.0 — use np.nan.)
        self.Y_train_targets: np.ndarray = np.ones(self.y_train.shape) * np.nan
        self.pipeline: Optional[BaseEstimator] = None

        self.logger.debug("Search space updates :{}".format(self.search_space_updates))
        self.keep_models = keep_models

    def fit_predict_and_loss(self) -> None:
        """Fit, predict and compute the loss for no resampling strategy"""
        assert self.splits is not None, "Can't fit pipeline in {} if datamanager.splits is None" \
            .format(self.__class__.__name__)
        additional_run_info: Optional[Dict] = None
        # With no resampling there is exactly one (train, None) split.
        split_id = 0
        self.logger.info("Starting fit {}".format(split_id))

        pipeline = self._get_pipeline()

        train_split, test_split = self.splits[split_id]
        assert test_split is None
        self.Y_actual_train = self.y_train[train_split]
        y_train_pred, y_valid_pred, y_test_pred = self._fit_and_predict(pipeline, split_id,
                                                                        train_indices=train_split,
                                                                        test_indices=test_split,
                                                                        add_pipeline_to_self=True)
        train_loss = self._loss(self.y_train[train_split], y_train_pred)
        # Report the loss on the best held-out data available: validation set
        # first, then test set, finally the train loss itself.
        if y_valid_pred is not None:
            loss = self._loss(self.y_valid, y_valid_pred)
        elif y_test_pred is not None:
            loss = self._loss(self.y_test, y_test_pred)
        else:
            loss = train_loss

        additional_run_info = pipeline.get_additional_run_info() if hasattr(
            pipeline, 'get_additional_run_info') else {}

        status = StatusType.SUCCESS

        self.logger.debug("In train evaluator fit_predict_and_loss, num_run: {} loss:{}".format(
            self.num_run,
            loss
        ))
        self.finish_up(
            loss=loss,
            train_loss=train_loss,
            valid_pred=y_valid_pred,
            test_pred=y_test_pred,
            additional_run_info=additional_run_info,
            file_output=True,
            status=status,
            opt_pred=None
        )

    def _fit_and_predict(self, pipeline: BaseEstimator, fold: int, train_indices: Union[np.ndarray, List],
                         test_indices: None,
                         add_pipeline_to_self: bool
                         ) -> Tuple[np.ndarray, Optional[np.ndarray], Optional[np.ndarray]]:
        """Fit ``pipeline`` on ``train_indices`` and predict on all available sets.

        ``test_indices`` is always None here (no resampling split).
        """
        X = {'train_indices': train_indices,
             'val_indices': test_indices,
             'split_id': fold,
             'num_run': self.num_run,
             **self.fit_dictionary}  # fit dictionary
        y = None
        fit_and_suppress_warnings(self.logger, pipeline, X, y)
        self.logger.info("Model fitted, now predicting")
        (
            Y_train_pred,
            Y_valid_pred,
            Y_test_pred
        ) = self._predict(
            pipeline,
            train_indices=train_indices,
        )

        if add_pipeline_to_self:
            self.pipeline = pipeline

        return Y_train_pred, Y_valid_pred, Y_test_pred

    def _predict(self, pipeline: BaseEstimator,
                 train_indices: Union[np.ndarray, List]
                 ) -> Tuple[np.ndarray, Optional[np.ndarray], Optional[np.ndarray]]:
        """Predict with the fitted pipeline on the train, validation and test sets.

        Returns ``None`` for the validation/test predictions when the
        corresponding feature sets are unavailable.
        """
        train_pred = self.predict_function(subsampler(self.X_train, train_indices), pipeline,
                                           self.y_train[train_indices])

        if self.X_valid is not None:
            valid_pred = self.predict_function(self.X_valid, pipeline,
                                               self.y_valid)
        else:
            valid_pred = None

        if self.X_test is not None:
            # BUGFIX: the test predictions must be paired with the test
            # targets, not the training targets (the original passed
            # self.y_train[train_indices] here, skewing test statistics).
            test_pred = self.predict_function(self.X_test, pipeline,
                                              self.y_test)
        else:
            test_pred = None

        return train_pred, valid_pred, test_pred

    def finish_up(self, loss: Dict[str, float], train_loss: Dict[str, float],
                  valid_pred: Optional[np.ndarray],
                  test_pred: Optional[np.ndarray], additional_run_info: Optional[Dict],
                  file_output: bool, status: StatusType,
                  opt_pred: Optional[np.ndarray]
                  ) -> Optional[Tuple[float, float, int, Dict]]:
        """This function does everything necessary after the fitting is done:

        * predicting
        * saving the necessary files
        We use it as the signal handler so we can recycle the code for the
        normal usecase and when the runsolver kills us here :)

        ``opt_pred`` is accepted for signature compatibility with the other
        evaluators but is unused (there is no optimization split).
        """
        self.duration = time.time() - self.starttime

        if file_output:
            loss_, additional_run_info_ = self.file_output(
                None, valid_pred, test_pred,
            )
        else:
            loss_ = None
            additional_run_info_ = {}

        validation_loss, test_loss = self.calculate_auxiliary_losses(
            valid_pred, test_pred
        )

        # A non-None loss_ signals a file-output failure (e.g. NaN predictions);
        # propagate it instead of enqueuing a result.
        if loss_ is not None:
            return self.duration, loss_, self.seed, additional_run_info_

        cost = loss[self.metric.name]

        additional_run_info = (
            {} if additional_run_info is None else additional_run_info
        )
        for metric_name, value in loss.items():
            additional_run_info[metric_name] = value
        additional_run_info['duration'] = self.duration
        additional_run_info['num_run'] = self.num_run
        if train_loss is not None:
            additional_run_info['train_loss'] = train_loss
        if validation_loss is not None:
            additional_run_info['validation_loss'] = validation_loss
        if test_loss is not None:
            additional_run_info['test_loss'] = test_loss

        rval_dict = {'loss': cost,
                     'additional_run_info': additional_run_info,
                     'status': status}

        self.queue.put(rval_dict)
        return None

    def file_output(
        self,
        Y_optimization_pred: np.ndarray,
        Y_valid_pred: np.ndarray,
        Y_test_pred: np.ndarray,
    ) -> Tuple[Optional[float], Dict]:
        """Persist the fitted pipeline and its predictions to the backend.

        ``Y_optimization_pred`` is accepted for signature compatibility and
        ignored (no optimization split without resampling).

        Returns:
            ``(None, {})`` on success or when output is disabled; on NaN
            predictions returns ``(1.0, {'error': ...})`` so the caller can
            abort the run.
        """
        # Abort if predictions contain NaNs
        for y, s in [
            [Y_valid_pred, 'validation'],
            [Y_test_pred, 'test']
        ]:
            if y is not None and not np.all(np.isfinite(y)):
                return (
                    1.0,
                    {
                        'error':
                            'Model predictions for %s set contains NaNs.' % s
                    },
                )

        # Abort if we don't want to output anything.
        if hasattr(self, 'disable_file_output'):
            if self.disable_file_output:
                return None, {}
            else:
                self.disabled_file_outputs = []

        # NOTE(review): if `disable_file_output` is absent,
        # `disabled_file_outputs` is presumably set by the base class —
        # confirm against AbstractEvaluator.
        if hasattr(self, 'pipeline') and self.pipeline is not None:
            if 'pipeline' not in self.disabled_file_outputs:
                pipeline = self.pipeline
            else:
                pipeline = None
        else:
            pipeline = None

        self.logger.debug("Saving model {}_{}_{} to disk".format(self.seed, self.num_run, self.budget))
        self.backend.save_numrun_to_dir(
            seed=int(self.seed),
            idx=int(self.num_run),
            budget=float(self.budget),
            model=pipeline,
            cv_model=None,
            ensemble_predictions=None,
            valid_predictions=(
                Y_valid_pred if 'y_valid' not in
                self.disabled_file_outputs else None
            ),
            test_predictions=(
                Y_test_pred if 'y_test' not in
                self.disabled_file_outputs else None
            ),
        )

        return None, {}
277+
278+
279+
# create closure for evaluating an algorithm
280+
def eval_function(
    backend: Backend,
    queue: Queue,
    metric: autoPyTorchMetric,
    budget: float,
    config: Optional[Configuration],
    seed: int,
    num_run: int,
    include: Optional[Dict[str, Any]],
    exclude: Optional[Dict[str, Any]],
    disable_file_output: Union[bool, List],
    output_y_hat_optimization: bool = False,
    pipeline_config: Optional[Dict[str, Any]] = None,
    budget_type: Optional[str] = None,
    init_params: Optional[Dict[str, Any]] = None,
    logger_port: Optional[int] = None,
    all_supported_metrics: bool = True,
    search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None,
    instance: Optional[str] = None,
) -> None:
    """Target-algorithm entry point: build a FitEvaluator and run it.

    Constructs a :class:`FitEvaluator` from the given arguments and calls
    ``fit_predict_and_loss()``, which puts its result dict on ``queue``.

    ``instance`` is accepted for compatibility with the SMAC target-algorithm
    signature but is not used.  (Annotations for ``budget_type`` and
    ``instance`` are ``Optional[str]``: PEP 484 disallows the implicit
    Optional of ``str = None``.)
    """
    evaluator = FitEvaluator(
        backend=backend,
        queue=queue,
        metric=metric,
        configuration=config,
        seed=seed,
        num_run=num_run,
        output_y_hat_optimization=output_y_hat_optimization,
        include=include,
        exclude=exclude,
        disable_file_output=disable_file_output,
        init_params=init_params,
        budget=budget,
        budget_type=budget_type,
        logger_port=logger_port,
        all_supported_metrics=all_supported_metrics,
        pipeline_config=pipeline_config,
        search_space_updates=search_space_updates
    )
    evaluator.fit_predict_and_loss()

0 commit comments

Comments
 (0)